Package ganeti :: Module backend

Source Code for Module ganeti.backend

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30   
  31  """Functions used by the node daemon 
  32   
  33  @var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in 
  34       the L{UploadFile} function 
  35  @var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted 
  36       in the L{_CleanDirectory} function 
  37   
  38  """ 
  39   
  40  # pylint: disable=E1103,C0302 
  41   
  42  # E1103: %s %r has no %r member (but some types could not be 
  43  # inferred), because the _TryOSFromDisk returns either (True, os_obj) 
  44  # or (False, "string") which confuses pylint 
  45   
  46  # C0302: This module has become too big and should be split up 
  47   
  48   
  49  import base64 
  50  import errno 
  51  import logging 
  52  import os 
  53  import os.path 
  54  import pycurl 
  55  import random 
  56  import re 
  57  import shutil 
  58  import signal 
  59  import stat 
  60  import tempfile 
  61  import time 
  62  import zlib 
  63  import contextlib 
  64  import collections 
  65   
  66  from ganeti import errors 
  67  from ganeti import http 
  68  from ganeti import utils 
  69  from ganeti import ssh 
  70  from ganeti import hypervisor 
  71  from ganeti.hypervisor import hv_base 
  72  from ganeti import constants 
  73  from ganeti.storage import bdev 
  74  from ganeti.storage import drbd 
  75  from ganeti.storage import extstorage 
  76  from ganeti.storage import filestorage 
  77  from ganeti import objects 
  78  from ganeti import ssconf 
  79  from ganeti import serializer 
  80  from ganeti import netutils 
  81  from ganeti import runtime 
  82  from ganeti import compat 
  83  from ganeti import pathutils 
  84  from ganeti import vcluster 
  85  from ganeti import ht 
  86  from ganeti.storage.base import BlockDev 
  87  from ganeti.storage.drbd import DRBD8 
  88  from ganeti import hooksmaster 
  89  import ganeti.metad as metad 
  90   
  91   
  92  _BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id" 
  93  _ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([ 
  94    pathutils.DATA_DIR, 
  95    pathutils.JOB_QUEUE_ARCHIVE_DIR, 
  96    pathutils.QUEUE_DIR, 
  97    pathutils.CRYPTO_KEYS_DIR, 
  98    ]) 
  99  _MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60 
 100  _X509_KEY_FILE = "key" 
 101  _X509_CERT_FILE = "cert" 
 102  _IES_STATUS_FILE = "status" 
 103  _IES_PID_FILE = "pid" 
 104  _IES_CA_FILE = "ca" 
 105   
 106  #: Valid LVS output line regex 
 107  _LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$") 
 108   
 109  # Actions for the master setup script 
 110  _MASTER_START = "start" 
 111  _MASTER_STOP = "stop" 
 112   
 113  #: Maximum file permissions for restricted command directory and executables 
 114  _RCMD_MAX_MODE = (stat.S_IRWXU | 
 115                    stat.S_IRGRP | stat.S_IXGRP | 
 116                    stat.S_IROTH | stat.S_IXOTH) 
 117   
 118  #: Delay before returning an error for restricted commands 
 119  _RCMD_INVALID_DELAY = 10 
 120   
 121  #: How long to wait to acquire lock for restricted commands (shorter than 
 122  #: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many 
 123  #: command requests arrive 
 124  _RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8 
  125 
  126 
  127  class RPCFail(Exception):
  128    """Class denoting RPC failure.
  129 
  130    Its argument is the error message.
  131 
  132    """
  133 
  134 
  135  def _GetInstReasonFilename(instance_name):
  136    """Path of the file containing the reason of the instance status change.
  137 
  138    @type instance_name: string
  139    @param instance_name: The name of the instance
  140    @rtype: string
  141    @return: The path of the file
  142 
  143    """
  144    return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)
  145 
  146 
  147  def _StoreInstReasonTrail(instance_name, trail):
  148    """Serialize a reason trail related to an instance change of state to file.
  149 
  150    The exact location of the file depends on the name of the instance and on
  151    the configuration of the Ganeti cluster defined at deploy time.
  152 
  153    @type instance_name: string
  154    @param instance_name: The name of the instance
  155 
  156    @type trail: list of reasons
  157    @param trail: reason trail
  158 
  159    @rtype: None
  160 
  161    """
  162    json = serializer.DumpJson(trail)
  163    filename = _GetInstReasonFilename(instance_name)
  164    utils.WriteFile(filename, data=json)
  165 
  166 
  167  def _Fail(msg, *args, **kwargs):
  168    """Log an error and then raise an RPCFail exception.
  169 
  170    This exception is then handled specially in the ganeti daemon and
  171    turned into a 'failed' return type. As such, this function is a
  172    useful shortcut for logging the error and returning it to the master
  173    daemon.
  174 
  175    @type msg: string
  176    @param msg: the text of the exception
  177    @raise RPCFail
  178 
  179    """
  180    if args:
  181      msg = msg % args
  182    if "log" not in kwargs or kwargs["log"]:  # if we should log this error
  183      if "exc" in kwargs and kwargs["exc"]:
  184        logging.exception(msg)
  185      else:
  186        logging.error(msg)
  187    raise RPCFail(msg)
  188 
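As an aside, the typical call patterns for C{_Fail} follow directly from the
code above (the argument values here are illustrative placeholders, not code
from this module):

# Interpolates args into msg, logs the message, then raises RPCFail:
#   _Fail("Instance %s not found", instance_name)
# Logs the message together with the current exception's traceback:
#   _Fail("OS error: %s", err, exc=True)
# Raises without logging at all:
#   _Fail("handled by the caller", log=False)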
  189 
  190  def _GetConfig():
  191    """Simple wrapper to return a SimpleStore.
  192 
  193    @rtype: L{ssconf.SimpleStore}
  194    @return: a SimpleStore instance
  195 
  196    """
  197    return ssconf.SimpleStore()
  198 
  199 
  200  def _GetSshRunner(cluster_name):
  201    """Simple wrapper to return an SshRunner.
  202 
  203    @type cluster_name: str
  204    @param cluster_name: the cluster name, which is needed
  205        by the SshRunner constructor
  206    @rtype: L{ssh.SshRunner}
  207    @return: an SshRunner instance
  208 
  209    """
  210    return ssh.SshRunner(cluster_name)
  211 
  212 
  213  def _Decompress(data):
  214    """Unpacks data compressed by the RPC client.
  215 
  216    @type data: list or tuple
  217    @param data: Data sent by RPC client
  218    @rtype: str
  219    @return: Decompressed data
  220 
  221    """
  222    assert isinstance(data, (list, tuple))
  223    assert len(data) == 2
  224    (encoding, content) = data
  225    if encoding == constants.RPC_ENCODING_NONE:
  226      return content
  227    elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
  228      return zlib.decompress(base64.b64decode(content))
  229    else:
  230      raise AssertionError("Unknown data encoding")
  231 
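For orientation, the packing side that C{_Decompress} undoes lives in the RPC
client. A minimal sketch of such a packer follows; the helper name and the
4 KiB threshold are assumptions for illustration, not the rpc module's actual
code:

import base64
import zlib

from ganeti import constants

def _CompressForRpc(data, threshold=4096):
  """Sketch: pack 'data' the way _Decompress above expects it."""
  if len(data) < threshold:
    # Small payloads go over the wire unmodified
    return (constants.RPC_ENCODING_NONE, data)
  # Larger payloads are zlib-compressed, then base64-encoded
  return (constants.RPC_ENCODING_ZLIB_BASE64,
          base64.b64encode(zlib.compress(data)))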
  232 
  233  def _CleanDirectory(path, exclude=None):
  234    """Removes all regular files in a directory.
  235 
  236    @type path: str
  237    @param path: the directory to clean
  238    @type exclude: list
  239    @param exclude: list of files to be excluded, defaults
  240        to the empty list
  241 
  242    """
  243    if path not in _ALLOWED_CLEAN_DIRS:
  244      _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
  245            path)
  246 
  247    if not os.path.isdir(path):
  248      return
  249    if exclude is None:
  250      exclude = []
  251    else:
  252      # Normalize excluded paths
  253      exclude = [os.path.normpath(i) for i in exclude]
  254 
  255    for rel_name in utils.ListVisibleFiles(path):
  256      full_name = utils.PathJoin(path, rel_name)
  257      if full_name in exclude:
  258        continue
  259      if os.path.isfile(full_name) and not os.path.islink(full_name):
  260        utils.RemoveFile(full_name)
  261 
  262 
  263  def _BuildUploadFileList():
  264    """Build the list of allowed upload files.
  265 
  266    This is abstracted so that it's built only once at module import time.
  267 
  268    """
  269    allowed_files = set([
  270      pathutils.CLUSTER_CONF_FILE,
  271      pathutils.ETC_HOSTS,
  272      pathutils.SSH_KNOWN_HOSTS_FILE,
  273      pathutils.VNC_PASSWORD_FILE,
  274      pathutils.RAPI_CERT_FILE,
  275      pathutils.SPICE_CERT_FILE,
  276      pathutils.SPICE_CACERT_FILE,
  277      pathutils.RAPI_USERS_FILE,
  278      pathutils.CONFD_HMAC_KEY,
  279      pathutils.CLUSTER_DOMAIN_SECRET_FILE,
  280      ])
  281 
  282    for hv_name in constants.HYPER_TYPES:
  283      hv_class = hypervisor.GetHypervisorClass(hv_name)
  284      allowed_files.update(hv_class.GetAncillaryFiles()[0])
  285 
  286    assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
  287      "Allowed file storage paths should never be uploaded via RPC"
  288 
  289    return frozenset(allowed_files)
  290 
  291 
  292  _ALLOWED_UPLOAD_FILES = _BuildUploadFileList()
  293 
  294 
  295  def JobQueuePurge():
  296    """Removes job queue files and archived jobs.
  297 
  298    @rtype: tuple
  299    @return: True, None
  300 
  301    """
  302    _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  303    _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)
  304 
  305 
  306  def GetMasterNodeName():
  307    """Returns the master node name.
  308 
  309    @rtype: string
  310    @return: name of the master node
  311    @raise RPCFail: in case of errors
  312 
  313    """
  314    try:
  315      return _GetConfig().GetMasterNode()
  316    except errors.ConfigurationError, err:
  317      _Fail("Cluster configuration incomplete: %s", err, exc=True)
  318 
  319 
  320  def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  321    """Decorator that runs hooks before and after the decorated function.
  322 
  323    @type hook_opcode: string
  324    @param hook_opcode: opcode of the hook
  325    @type hooks_path: string
  326    @param hooks_path: path of the hooks
  327    @type env_builder_fn: function
  328    @param env_builder_fn: function that returns a dictionary containing the
  329      environment variables for the hooks. Will get all the parameters of the
  330      decorated function.
  331    @raise RPCFail: in case of pre-hook failure
  332 
  333    """
  334    def decorator(fn):
  335      def wrapper(*args, **kwargs):
  336        # Although the hooks run locally, we still have to pass an uuid,
  337        # which will then be ignored in RunLocalHooks.
  338        nodes = ([constants.DUMMY_UUID], [constants.DUMMY_UUID])
  339 
  340        env_fn = compat.partial(env_builder_fn, *args, **kwargs)
  341 
  342        cfg = _GetConfig()
  343        hr = HooksRunner()
  344        hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
  345                                     hr.RunLocalHooks, None, env_fn, None,
  346                                     logging.warning, cfg.GetClusterName(),
  347                                     cfg.GetMasterNode())
  348        hm.RunPhase(constants.HOOKS_PHASE_PRE)
  349        result = fn(*args, **kwargs)
  350        hm.RunPhase(constants.HOOKS_PHASE_POST)
  351 
  352        return result
  353      return wrapper
  354    return decorator
  355 
  356 
  357  def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  358    """Builds environment variables for master IP hooks.
  359 
  360    @type master_params: L{objects.MasterNetworkParameters}
  361    @param master_params: network parameters of the master
  362    @type use_external_mip_script: boolean
  363    @param use_external_mip_script: whether to use an external master IP
  364      address setup script (unused, but necessary per the implementation of the
  365      _RunLocalHooks decorator)
  366 
  367    """
  368    # pylint: disable=W0613
  369    ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  370    env = {
  371      "MASTER_NETDEV": master_params.netdev,
  372      "MASTER_IP": master_params.ip,
  373      "MASTER_NETMASK": str(master_params.netmask),
  374      "CLUSTER_IP_VERSION": str(ver),
  375    }
  376 
  377    return env
  378 
  379 
  380  def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  381    """Execute the master IP address setup script.
  382 
  383    @type master_params: L{objects.MasterNetworkParameters}
  384    @param master_params: network parameters of the master
  385    @type action: string
  386    @param action: action to pass to the script. Must be one of
  387      L{backend._MASTER_START} or L{backend._MASTER_STOP}
  388    @type use_external_mip_script: boolean
  389    @param use_external_mip_script: whether to use an external master IP
  390      address setup script
  391    @raise backend.RPCFail: if there are errors during the execution of the
  392      script
  393 
  394    """
  395    env = _BuildMasterIpEnv(master_params)
  396 
  397    if use_external_mip_script:
  398      setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  399    else:
  400      setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT
  401 
  402    result = utils.RunCmd([setup_script, action], env=env, reset_env=True)
  403 
  404    if result.failed:
  405      _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
  406            (action, result.exit_code, result.output), log=True)
  407 
  408 
  409  @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
  410                 _BuildMasterIpEnv)
  411  def ActivateMasterIp(master_params, use_external_mip_script):
  412    """Activate the IP address of the master daemon.
  413 
  414    @type master_params: L{objects.MasterNetworkParameters}
  415    @param master_params: network parameters of the master
  416    @type use_external_mip_script: boolean
  417    @param use_external_mip_script: whether to use an external master IP
  418      address setup script
  419    @raise RPCFail: in case of errors during the IP startup
  420 
  421    """
  422    _RunMasterSetupScript(master_params, _MASTER_START,
  423                          use_external_mip_script)
  424 
  425 
  426  def StartMasterDaemons(no_voting):
  427    """Activate local node as master node.
  428 
  429    The function will start the master daemons (ganeti-masterd and ganeti-rapi).
  430 
  431    @type no_voting: boolean
  432    @param no_voting: whether to start ganeti-masterd without a node vote
  433      but still non-interactively
  434    @rtype: None
  435 
  436    """
  437 
  438    if no_voting:
  439      daemon_args = "--no-voting --yes-do-it"
  440    else:
  441      daemon_args = ""
  442 
  443    env = {
  444      "EXTRA_LUXID_ARGS": daemon_args,
  445      "EXTRA_WCONFD_ARGS": daemon_args,
  446    }
  447 
  448    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  449    if result.failed:
  450      msg = "Can't start Ganeti master: %s" % result.output
  451      logging.error(msg)
  452      _Fail(msg)
  453 
  454 
  455  @RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
  456                 _BuildMasterIpEnv)
  457  def DeactivateMasterIp(master_params, use_external_mip_script):
  458    """Deactivate the master IP on this node.
  459 
  460    @type master_params: L{objects.MasterNetworkParameters}
  461    @param master_params: network parameters of the master
  462    @type use_external_mip_script: boolean
  463    @param use_external_mip_script: whether to use an external master IP
  464      address setup script
  465    @raise RPCFail: in case of errors during the IP turndown
  466 
  467    """
  468    _RunMasterSetupScript(master_params, _MASTER_STOP,
  469                          use_external_mip_script)
  470 
  471 
  472  def StopMasterDaemons():
  473    """Stop the master daemons on this node.
  474 
  475    Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.
  476 
  477    @rtype: None
  478 
  479    """
  480    # TODO: log and report back to the caller the error failures; we
  481    # need to decide in which case we fail the RPC for this
  482 
  483    result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  484    if result.failed:
  485      logging.error("Could not stop Ganeti master, command %s had exitcode %s"
  486                    " and error %s",
  487                    result.cmd, result.exit_code, result.output)
  488 
  489 
  490  def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  491    """Change the netmask of the master IP.
  492 
  493    @param old_netmask: the old value of the netmask
  494    @param netmask: the new value of the netmask
  495    @param master_ip: the master IP
  496    @param master_netdev: the master network device
  497 
  498    """
  499    if old_netmask == netmask:
  500      return
  501 
  502    if not netutils.IPAddress.Own(master_ip):
  503      _Fail("The master IP address is not up, not attempting to change its"
  504            " netmask")
  505 
  506    result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
  507                           "%s/%s" % (master_ip, netmask),
  508                           "dev", master_netdev, "label",
  509                           "%s:0" % master_netdev])
  510    if result.failed:
  511      _Fail("Could not set the new netmask on the master IP address")
  512 
  513    result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
  514                           "%s/%s" % (master_ip, old_netmask),
  515                           "dev", master_netdev, "label",
  516                           "%s:0" % master_netdev])
  517    if result.failed:
  518      _Fail("Could not bring down the master IP address with the old netmask")
  519 
  520 
  521  def EtcHostsModify(mode, host, ip):
  522    """Modify a host entry in /etc/hosts.
  523 
  524    @param mode: The mode to operate. Either add or remove entry
  525    @param host: The host to operate on
  526    @param ip: The ip associated with the entry
  527 
  528    """
  529    if mode == constants.ETC_HOSTS_ADD:
  530      if not ip:
  531        _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
  532              " present")
  533      utils.AddHostToEtcHosts(host, ip)
  534    elif mode == constants.ETC_HOSTS_REMOVE:
  535      if ip:
  536        _Fail("Mode 'remove' does not allow 'ip' parameter, but"
  537              " parameter is present")
  538      utils.RemoveHostFromEtcHosts(host)
  539    else:
  540      _Fail("Mode not supported")
  541 
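A brief usage sketch of the two supported modes (the host name and address
are illustrative values):

# Add an entry for node3, then remove it again:
EtcHostsModify(constants.ETC_HOSTS_ADD, "node3.example.com", "192.0.2.13")
EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node3.example.com", None)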
  542 
  543  def LeaveCluster(modify_ssh_setup):
  544    """Cleans up and removes the current node.
  545 
  546    This function cleans up and prepares the current node to be removed
  547    from the cluster.
  548 
  549    If processing is successful, then it raises an
  550    L{errors.QuitGanetiException} which is used as a special case to
  551    shutdown the node daemon.
  552 
  553    @param modify_ssh_setup: boolean
  554 
  555    """
  556    _CleanDirectory(pathutils.DATA_DIR)
  557    _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  558    JobQueuePurge()
  559 
  560    if modify_ssh_setup:
  561      try:
  562        priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
  563 
  564        ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
  565 
  566        utils.RemoveFile(priv_key)
  567        utils.RemoveFile(pub_key)
  568      except errors.OpExecError:
  569        logging.exception("Error while processing ssh files")
  570      except IOError:
  571        logging.exception("At least one SSH file was not accessible.")
  572 
  573    try:
  574      utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
  575      utils.RemoveFile(pathutils.RAPI_CERT_FILE)
  576      utils.RemoveFile(pathutils.SPICE_CERT_FILE)
  577      utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
  578      utils.RemoveFile(pathutils.NODED_CERT_FILE)
  579    except:  # pylint: disable=W0702
  580      logging.exception("Error while removing cluster secrets")
  581 
  582    utils.StopDaemon(constants.CONFD)
  583    utils.StopDaemon(constants.MOND)
  584    utils.StopDaemon(constants.KVMD)
  585 
  586    # Raise a custom exception (handled in ganeti-noded)
  587    raise errors.QuitGanetiException(True, "Shutdown scheduled")
  588 
  589 
  590  def _CheckStorageParams(params, num_params):
  591    """Performs sanity checks for storage parameters.
  592 
  593    @type params: list
  594    @param params: list of storage parameters
  595    @type num_params: int
  596    @param num_params: expected number of parameters
  597 
  598    """
  599    if params is None:
  600      raise errors.ProgrammerError("No storage parameters for storage"
  601                                   " reporting were provided.")
  602    if not isinstance(params, list):
  603      raise errors.ProgrammerError("The storage parameters are not of type"
  604                                   " list: '%s'" % params)
  605    if not len(params) == num_params:
  606      raise errors.ProgrammerError("Did not receive the expected number of"
  607                                   " storage parameters: expected %s,"
  608                                   " received '%s'" % (num_params, len(params)))
  609 
  610 
  611  def _CheckLvmStorageParams(params):
  612    """Performs sanity check for the 'exclusive storage' flag.
  613 
  614    @see: C{_CheckStorageParams}
  615 
  616    """
  617    _CheckStorageParams(params, 1)
  618    excl_stor = params[0]
  619    if not isinstance(params[0], bool):
  620      raise errors.ProgrammerError("Exclusive storage parameter is not"
  621                                   " boolean: '%s'." % excl_stor)
  622    return excl_stor
  623 
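The parameter contract of the LVM variant can be summarized in a short
doctest-style sketch (error text abridged):

# _CheckLvmStorageParams([True])    -> returns True
# _CheckLvmStorageParams([False])   -> returns False
# _CheckLvmStorageParams([])        -> ProgrammerError (wrong parameter count)
# _CheckLvmStorageParams(["yes"])   -> ProgrammerError (flag is not a boolean)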
  624 
  625  def _GetLvmVgSpaceInfo(name, params):
  626    """Wrapper around C{_GetVgInfo} which checks the storage parameters.
  627 
  628    @type name: string
  629    @param name: name of the volume group
  630    @type params: list
  631    @param params: list of storage parameters, which in this case should
  632      contain only one, the exclusive storage flag
  633 
  634    """
  635    excl_stor = _CheckLvmStorageParams(params)
  636    return _GetVgInfo(name, excl_stor)
  637 
  638 
  639  def _GetVgInfo(
  640      name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  641    """Retrieves information about an LVM volume group.
  642 
  643    """
  644    # TODO: GetVGInfo supports returning information for multiple VGs at once
  645    vginfo = info_fn([name], excl_stor)
  646    if vginfo:
  647      vg_free = int(round(vginfo[0][0], 0))
  648      vg_size = int(round(vginfo[0][1], 0))
  649    else:
  650      vg_free = None
  651      vg_size = None
  652 
  653    return {
  654      "type": constants.ST_LVM_VG,
  655      "name": name,
  656      "storage_free": vg_free,
  657      "storage_size": vg_size,
  658    }
  659 
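The C{info_fn} hook exists mainly for testability; a stub standing in for
C{bdev.LogicalVolume.GetVGInfo} only needs to provide the two fields read
above (a sketch, sizes in MiB and all values illustrative):

def _StubVgInfo(names, excl_stor):
  # One (vg_free, vg_size, ...) tuple per requested VG; only the first
  # two fields are used by _GetVgInfo.
  return [(12288.0, 20480.0)]

# _GetVgInfo("xenvg", False, info_fn=_StubVgInfo) would then yield:
#   {"type": constants.ST_LVM_VG, "name": "xenvg",
#    "storage_free": 12288, "storage_size": 20480}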
  660 
  661  def _GetLvmPvSpaceInfo(name, params):
  662    """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.
  663 
  664    @see: C{_GetLvmVgSpaceInfo}
  665 
  666    """
  667    excl_stor = _CheckLvmStorageParams(params)
  668    return _GetVgSpindlesInfo(name, excl_stor)
  669 
  670 
  671  def _GetVgSpindlesInfo(
  672      name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  673    """Retrieves information about spindles in an LVM volume group.
  674 
  675    @type name: string
  676    @param name: VG name
  677    @type excl_stor: bool
  678    @param excl_stor: exclusive storage
  679    @rtype: dict
  680    @return: dictionary with keys "type", "name", "storage_free" and
  681      "storage_size", the latter two being free and total spindles respectively
  682 
  683    """
  684    if excl_stor:
  685      (vg_free, vg_size) = info_fn(name)
  686    else:
  687      vg_free = 0
  688      vg_size = 0
  689    return {
  690      "type": constants.ST_LVM_PV,
  691      "name": name,
  692      "storage_free": vg_free,
  693      "storage_size": vg_size,
  694    }
  695 
  696 
  697  def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  698    """Retrieves node information from a hypervisor.
  699 
  700    The information returned depends on the hypervisor. Common items:
  701 
  702      - vg_size is the size of the configured volume group in MiB
  703      - vg_free is the free size of the volume group in MiB
  704      - memory_dom0 is the memory allocated for domain0 in MiB
  705      - memory_free is the currently available (free) ram in MiB
  706      - memory_total is the total amount of ram in MiB
  707      - hv_version: the hypervisor version, if available
  708 
  709    @type hvparams: dict of string
  710    @param hvparams: the hypervisor's hvparams
  711 
  712    """
  713    return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)
  714 
  715 
  716  def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  717    """Retrieves node information for all hypervisors.
  718 
  719    See C{_GetHvInfo} for information on the output.
  720 
  721    @type hv_specs: list of pairs (string, dict of strings)
  722    @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  723 
  724    """
  725    if hv_specs is None:
  726      return None
  727 
  728    result = []
  729    for hvname, hvparams in hv_specs:
  730      result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  731    return result
  732 
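The shape of C{hv_specs} is a plain list of (name, hvparams) pairs, for
example (values illustrative, "kvm" being the usual name behind
C{constants.HT_KVM}):

hv_specs = [
  ("kvm", {"kernel_path": "/boot/vmlinuz"}),
  ("xen-pvm", {}),
]
# Returns one node-info dictionary per hypervisor, in order:
infos = _GetHvInfoAll(hv_specs)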
  733 
  734  def _GetNamedNodeInfo(names, fn):
  735    """Calls C{fn} for all names in C{names} and returns a list of results.
  736 
  737    @rtype: None or list
  738 
  739    """
  740    if names is None:
  741      return None
  742    else:
  743      return map(fn, names)
  744 
  745 
  746  def GetNodeInfo(storage_units, hv_specs):
  747    """Gives back a hash with different information about the node.
  748 
  749    @type storage_units: list of tuples (string, string, list)
  750    @param storage_units: List of tuples (storage unit, identifier, parameters)
  751      to ask for disk space information. In case of lvm-vg, the identifier is
  752      the VG name. The parameters can contain additional, storage-type-specific
  753      parameters, for example exclusive storage for lvm storage.
  754    @type hv_specs: list of pairs (string, dict of strings)
  755    @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  756    @rtype: tuple; (string, None/dict, None/dict)
  757    @return: Tuple containing boot ID, volume group information and hypervisor
  758      information
  759 
  760    """
  761    bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  762    storage_info = _GetNamedNodeInfo(
  763      storage_units,
  764      (lambda (storage_type, storage_key, storage_params):
  765          _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  766    hv_info = _GetHvInfoAll(hv_specs)
  767    return (bootid, storage_info, hv_info)
  768 
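Putting the pieces together, a request for LVM VG and file storage reporting
might look like this sketch (paths and names are illustrative; the parameter
lists follow the per-type contracts checked above):

storage_units = [
  (constants.ST_LVM_VG, "xenvg", [False]),               # [excl_stor flag]
  (constants.ST_FILE, "/srv/ganeti/file-storage", []),   # no parameters
]
hv_specs = [("kvm", {})]
(boot_id, storage_info, hv_info) = GetNodeInfo(storage_units, hv_specs)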
  769 
  770  def _GetFileStorageSpaceInfo(path, params):
  771    """Wrapper around filestorage.GetSpaceInfo.
  772 
  773    The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  774    and ignore the *args parameter to not leak it into the filestorage
  775    module's code.
  776 
  777    @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
  778      parameters.
  779 
  780    """
  781    _CheckStorageParams(params, 0)
  782    return filestorage.GetFileStorageSpaceInfo(path)
  783 
  784 
  785  # FIXME: implement storage reporting for all missing storage types.
  786  _STORAGE_TYPE_INFO_FN = {
  787    constants.ST_BLOCK: None,
  788    constants.ST_DISKLESS: None,
  789    constants.ST_EXT: None,
  790    constants.ST_FILE: _GetFileStorageSpaceInfo,
  791    constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  792    constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  793    constants.ST_SHARED_FILE: None,
  794    constants.ST_GLUSTER: None,
  795    constants.ST_RADOS: None,
  796  }
  797 
  798 
  799  def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  800    """Looks up and applies the correct function to calculate free and total
  801    storage for the given storage type.
  802 
  803    @type storage_type: string
  804    @param storage_type: the storage type for which the storage shall be reported
  805    @type storage_key: string
  806    @param storage_key: identifier of a storage unit, e.g. the volume group name
  807      of an LVM storage unit
  808    @type args: any
  809    @param args: various parameters that can be used for storage reporting. These
  810      parameters and their semantics vary from storage type to storage type and
  811      are just propagated in this function.
  812    @return: the results of the application of the storage space function (see
  813      _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
  814      storage type
  815    @raises NotImplementedError: for storage types which don't support space
  816      reporting yet
  817    """
  818    fn = _STORAGE_TYPE_INFO_FN[storage_type]
  819    if fn is not None:
  820      return fn(storage_key, *args)
  821    else:
  822      raise NotImplementedError
  823 
  824 
  825  def _CheckExclusivePvs(pvi_list):
  826    """Check that PVs are not shared among LVs.
  827 
  828    @type pvi_list: list of L{objects.LvmPvInfo} objects
  829    @param pvi_list: information about the PVs
  830 
  831    @rtype: list of tuples (string, list of strings)
  832    @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])
  833 
  834    """
  835    res = []
  836    for pvi in pvi_list:
  837      if len(pvi.lv_list) > 1:
  838        res.append((pvi.name, pvi.lv_list))
  839    return res
  840 
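A self-contained sketch of the exclusivity check, using a namedtuple as a
stand-in for L{objects.LvmPvInfo} (only the two attributes read above):

import collections

FakePv = collections.namedtuple("FakePv", ["name", "lv_list"])

pvs = [FakePv("/dev/sda1", ["lv1"]),
       FakePv("/dev/sdb1", ["lv2", "lv3"])]
# Only the PV carrying more than one LV is reported:
assert _CheckExclusivePvs(pvs) == [("/dev/sdb1", ["lv2", "lv3"])]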
  841 
  842  def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
  843                         get_hv_fn=hypervisor.GetHypervisor):
  844    """Verifies the hypervisor. Appends the results to the 'results' list.
  845 
  846    @type what: C{dict}
  847    @param what: a dictionary of things to check
  848    @type vm_capable: boolean
  849    @param vm_capable: whether or not this node is vm capable
  850    @type result: dict
  851    @param result: dictionary of verification results; results of the
  852      verifications in this function will be added here
  853    @type all_hvparams: dict of dict of string
  854    @param all_hvparams: dictionary mapping hypervisor names to hvparams
  855    @type get_hv_fn: function
  856    @param get_hv_fn: function to retrieve the hypervisor, to improve testability
  857 
  858    """
  859    if not vm_capable:
  860      return
  861 
  862    if constants.NV_HYPERVISOR in what:
  863      result[constants.NV_HYPERVISOR] = {}
  864      for hv_name in what[constants.NV_HYPERVISOR]:
  865        hvparams = all_hvparams[hv_name]
  866        try:
  867          val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
  868        except errors.HypervisorError, err:
  869          val = "Error while checking hypervisor: %s" % str(err)
  870        result[constants.NV_HYPERVISOR][hv_name] = val
  871 
  872 
  873  def _VerifyHvparams(what, vm_capable, result,
  874                      get_hv_fn=hypervisor.GetHypervisor):
  875    """Verifies the hvparams. Appends the results to the 'results' list.
  876 
  877    @type what: C{dict}
  878    @param what: a dictionary of things to check
  879    @type vm_capable: boolean
  880    @param vm_capable: whether or not this node is vm capable
  881    @type result: dict
  882    @param result: dictionary of verification results; results of the
  883      verifications in this function will be added here
  884    @type get_hv_fn: function
  885    @param get_hv_fn: function to retrieve the hypervisor, to improve testability
  886 
  887    """
  888    if not vm_capable:
  889      return
  890 
  891    if constants.NV_HVPARAMS in what:
  892      result[constants.NV_HVPARAMS] = []
  893      for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
  894        try:
  895          logging.info("Validating hv %s, %s", hv_name, hvparms)
  896          get_hv_fn(hv_name).ValidateParameters(hvparms)
  897        except errors.HypervisorError, err:
  898          result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))
  899 
  900 
  901  def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  902    """Verifies the instance list.
  903 
  904    @type what: C{dict}
  905    @param what: a dictionary of things to check
  906    @type vm_capable: boolean
  907    @param vm_capable: whether or not this node is vm capable
  908    @type result: dict
  909    @param result: dictionary of verification results; results of the
  910      verifications in this function will be added here
  911    @type all_hvparams: dict of dict of string
  912    @param all_hvparams: dictionary mapping hypervisor names to hvparams
  913 
  914    """
  915    if constants.NV_INSTANCELIST in what and vm_capable:
  916      # GetInstanceList can fail
  917      try:
  918        val = GetInstanceList(what[constants.NV_INSTANCELIST],
  919                              all_hvparams=all_hvparams)
  920      except RPCFail, err:
  921        val = str(err)
  922      result[constants.NV_INSTANCELIST] = val
  923 
  924 
  925  def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  926    """Verifies the node info.
  927 
  928    @type what: C{dict}
  929    @param what: a dictionary of things to check
  930    @type vm_capable: boolean
  931    @param vm_capable: whether or not this node is vm capable
  932    @type result: dict
  933    @param result: dictionary of verification results; results of the
  934      verifications in this function will be added here
  935    @type all_hvparams: dict of dict of string
  936    @param all_hvparams: dictionary mapping hypervisor names to hvparams
  937 
  938    """
  939    if constants.NV_HVINFO in what and vm_capable:
  940      hvname = what[constants.NV_HVINFO]
  941      hyper = hypervisor.GetHypervisor(hvname)
  942      hvparams = all_hvparams[hvname]
  943      result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)
  944 
  945 
  946  def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  947    """Verify the existence and validity of the client SSL certificate.
  948 
  949    Also, verify that the client certificate is not self-signed. Self-
  950    signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  951    and should be replaced by client certificates signed by the server
  952    certificate. Hence we output a warning when we encounter a self-signed
  953    one.
  954 
  955    """
  956    create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  957    if not os.path.exists(cert_file):
  958      return (constants.CV_ERROR,
  959              "The client certificate does not exist. Run '%s' to create"
  960              " client certificates for all nodes." % create_cert_cmd)
  961 
  962    (errcode, msg) = utils.VerifyCertificate(cert_file)
  963    if errcode is not None:
  964      return (errcode, msg)
  965 
  966    (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  967    if errcode is not None:
  968      return (errcode, msg)
  969 
  970    # if everything is fine, we return the digest to be compared to the config
  971    return (None, utils.GetCertificateDigest(cert_filename=cert_file))
  972 
  973 
  974  def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
  975                      ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  976    """Verifies the state of the SSH key files.
  977 
  978    @type node_status_list: list of tuples
  979    @param node_status_list: list of nodes of the cluster associated with a
  980      couple of flags: (uuid, name, is_master_candidate,
  981      is_potential_master_candidate, online)
  982    @type my_name: str
  983    @param my_name: name of this node
  984    @type ssh_key_type: one of L{constants.SSHK_ALL}
  985    @param ssh_key_type: type of key used on nodes
  986    @type ganeti_pub_keys_file: str
  987    @param ganeti_pub_keys_file: filename of the public keys file
  988 
  989    """
  990    if node_status_list is None:
  991      return ["No node list to check against the pub_key_file received."]
  992 
  993    my_status_list = [(my_uuid, name, mc, pot_mc, online) for
  994                      (my_uuid, name, mc, pot_mc, online)
  995                      in node_status_list if name == my_name]
  996    if len(my_status_list) == 0:
  997      return ["Cannot find node information for node '%s'." % my_name]
  998    (my_uuid, _, _, potential_master_candidate, online) = \
  999      my_status_list[0]
 1000 
 1001    result = []
 1002 
 1003    if not os.path.exists(ganeti_pub_keys_file):
 1004      result.append("The public key file '%s' does not exist. Consider running"
 1005                    " 'gnt-cluster renew-crypto --new-ssh-keys"
 1006                    " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
 1007      return result
 1008 
 1009    pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
 1010    offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
 1011                     if not online]
 1012    pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)
 1013 
 1014    if potential_master_candidate:
 1015      # Check that the set of potential master candidates matches the
 1016      # public key file
 1017      pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
 1018      pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
 1019      missing_uuids = set([])
 1020      if pub_uuids_set != pot_mc_uuids_set:
 1021        unknown_uuids = pub_uuids_set - pot_mc_uuids_set
 1022        if unknown_uuids:
 1023          result.append("The following node UUIDs are listed in the public key"
 1024                        " file on node '%s', but are not potential master"
 1025                        " candidates: %s."
 1026                        % (my_name, ", ".join(list(unknown_uuids))))
 1027        missing_uuids = pot_mc_uuids_set - pub_uuids_set
 1028        if missing_uuids:
 1029          result.append("The following node UUIDs of potential master candidates"
 1030                        " are missing in the public key file on node %s: %s."
 1031                        % (my_name, ", ".join(list(missing_uuids))))
 1032 
 1033      (_, key_files) = \
 1034        ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 1035      (_, node_pub_key_file) = key_files[ssh_key_type]
 1036 
 1037      my_keys = pub_keys[my_uuid]
 1038 
 1039      node_pub_key = utils.ReadFile(node_pub_key_file)
 1040      if node_pub_key.strip() not in my_keys:
 1041        result.append("The SSH key of node %s does not match this node's key"
 1042                      " in the pub key file." % my_name)
 1043      if len(my_keys) != 1:
 1044        result.append("There is more than one key for node %s in the public key"
 1045                      " file." % my_name)
 1046    else:
 1047      if len(pub_keys.keys()) > 0:
 1048        result.append("The public key file of node '%s' is not empty, although"
 1049                      " the node is not a potential master candidate."
 1050                      % my_name)
 1051 
 1052    # Check that all master candidate keys are in the authorized_keys file
 1053    (auth_key_file, _) = \
 1054      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 1055    for (uuid, name, mc, _, online) in node_status_list:
 1056      if not online:
 1057        continue
 1058      if uuid in missing_uuids:
 1059        continue
 1060      if mc:
 1061        for key in pub_keys[uuid]:
 1062          if not ssh.HasAuthorizedKey(auth_key_file, key):
 1063            result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
 1064                          " not in the 'authorized_keys' file of node '%s'."
 1065                          % (name, uuid, my_name))
 1066      else:
 1067        for key in pub_keys[uuid]:
 1068          if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
 1069            result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
 1070                          " 'authorized_keys' file of node '%s'."
 1071                          % (name, uuid, my_name))
 1072          if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
 1073            result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
 1074                          " in the 'authorized_keys' file of itself."
 1075                          % (my_name, uuid))
 1076 
 1077    return result
 1078 
 1079 
 1080  def _VerifySshClutter(node_status_list, my_name):
 1081    """Verifies that the 'authorized_keys' files are not cluttered up.
 1082 
 1083    @type node_status_list: list of tuples
 1084    @param node_status_list: list of nodes of the cluster associated with a
 1085      couple of flags: (uuid, name, is_master_candidate,
 1086      is_potential_master_candidate, online)
 1087    @type my_name: str
 1088    @param my_name: name of this node
 1089 
 1090    """
 1091    result = []
 1092    (auth_key_file, _) = \
 1093      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 1094    node_names = [name for (_, name, _, _, _) in node_status_list]
 1095    multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
 1096    if multiple_occurrences:
 1097      msg = "There are hosts which have more than one SSH key stored for the" \
 1098            " same user in the 'authorized_keys' file of node %s. This can be" \
 1099            " due to an unsuccessful operation which cluttered up the" \
 1100            " 'authorized_keys' file. We recommend to clean this up manually. " \
 1101            % my_name
 1102      for host, occ in multiple_occurrences.items():
 1103        msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
 1104      result.append(msg)
 1105 
 1106    return result
 1107 
 1108 
 1109  def VerifyNode(what, cluster_name, all_hvparams):
 1110    """Verify the status of the local node.
 1111 
 1112    Based on the input L{what} parameter, various checks are done on the
 1113    local node.
 1114 
 1115    If the I{filelist} key is present, this list of
 1116    files is checksummed and the file/checksum pairs are returned.
 1117 
 1118    If the I{nodelist} key is present, we check that we have
 1119    connectivity via ssh with the target nodes (and check the hostname
 1120    report).
 1121 
 1122    If the I{node-net-test} key is present, we check that we have
 1123    connectivity to the given nodes via both primary IP and, if
 1124    applicable, secondary IPs.
 1125 
 1126    @type what: C{dict}
 1127    @param what: a dictionary of things to check:
 1128      - filelist: list of files for which to compute checksums
 1129      - nodelist: list of nodes we should check ssh communication with
 1130      - node-net-test: list of nodes we should check node daemon port
 1131        connectivity with
 1132      - hypervisor: list with hypervisors to run the verify for
 1133    @type cluster_name: string
 1134    @param cluster_name: the cluster's name
 1135    @type all_hvparams: dict of dict of strings
 1136    @param all_hvparams: a dictionary mapping hypervisor names to hvparams
 1137    @rtype: dict
 1138    @return: a dictionary with the same keys as the input dict, and
 1139      values representing the result of the checks
 1140 
 1141    """
 1142    result = {}
 1143    my_name = netutils.Hostname.GetSysName()
 1144    port = netutils.GetDaemonPort(constants.NODED)
 1145    vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])
 1146 
 1147    _VerifyHypervisors(what, vm_capable, result, all_hvparams)
 1148    _VerifyHvparams(what, vm_capable, result)
 1149 
 1150    if constants.NV_FILELIST in what:
 1151      fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
 1152                                                what[constants.NV_FILELIST]))
 1153      result[constants.NV_FILELIST] = \
 1154        dict((vcluster.MakeVirtualPath(key), value)
 1155             for (key, value) in fingerprints.items())
 1156 
 1157    if constants.NV_CLIENT_CERT in what:
 1158      result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()
 1159 
 1160    if constants.NV_SSH_SETUP in what:
 1161      node_status_list, key_type = what[constants.NV_SSH_SETUP]
 1162      result[constants.NV_SSH_SETUP] = \
 1163        _VerifySshSetup(node_status_list, my_name, key_type)
 1164      if constants.NV_SSH_CLUTTER in what:
 1165        result[constants.NV_SSH_CLUTTER] = \
 1166          _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)
 1167 
 1168    if constants.NV_NODELIST in what:
 1169      (nodes, bynode, mcs) = what[constants.NV_NODELIST]
 1170 
 1171      # Add nodes from other groups (different for each node)
 1172      try:
 1173        nodes.extend(bynode[my_name])
 1174      except KeyError:
 1175        pass
 1176 
 1177      # Use a random order
 1178      random.shuffle(nodes)
 1179 
 1180      # Try to contact all nodes
 1181      val = {}
 1182      ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
 1183      for node in nodes:
 1184        # We only test if master candidates can communicate to other nodes.
 1185        # We cannot test if normal nodes cannot communicate with other nodes,
 1186        # because the administrator might have installed additional SSH keys,
 1187        # over which Ganeti has no power.
 1188        if my_name in mcs:
 1189          success, message = _GetSshRunner(cluster_name). \
 1190                                VerifyNodeHostname(node, ssh_port_map[node])
 1191          if not success:
 1192            val[node] = message
 1193 
 1194      result[constants.NV_NODELIST] = val
 1195 
 1196    if constants.NV_NODENETTEST in what:
 1197      result[constants.NV_NODENETTEST] = tmp = {}
 1198      my_pip = my_sip = None
 1199      for name, pip, sip in what[constants.NV_NODENETTEST]:
 1200        if name == my_name:
 1201          my_pip = pip
 1202          my_sip = sip
 1203          break
 1204      if not my_pip:
 1205        tmp[my_name] = ("Can't find my own primary/secondary IP"
 1206                        " in the node list")
 1207      else:
 1208        for name, pip, sip in what[constants.NV_NODENETTEST]:
 1209          fail = []
 1210          if not netutils.TcpPing(pip, port, source=my_pip):
 1211            fail.append("primary")
 1212          if sip != pip:
 1213            if not netutils.TcpPing(sip, port, source=my_sip):
 1214              fail.append("secondary")
 1215          if fail:
 1216            tmp[name] = ("failure using the %s interface(s)" %
 1217                         " and ".join(fail))
 1218 
 1219    if constants.NV_MASTERIP in what:
 1220      # FIXME: add checks on incoming data structures (here and in the
 1221      # rest of the function)
 1222      master_name, master_ip = what[constants.NV_MASTERIP]
 1223      if master_name == my_name:
 1224        source = constants.IP4_ADDRESS_LOCALHOST
 1225      else:
 1226        source = None
 1227      result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
 1228                                                       source=source)
 1229 
 1230    if constants.NV_USERSCRIPTS in what:
 1231      result[constants.NV_USERSCRIPTS] = \
 1232        [script for script in what[constants.NV_USERSCRIPTS]
 1233         if not utils.IsExecutable(script)]
 1234 
 1235    if constants.NV_OOB_PATHS in what:
 1236      result[constants.NV_OOB_PATHS] = tmp = []
 1237      for path in what[constants.NV_OOB_PATHS]:
 1238        try:
 1239          st = os.stat(path)
 1240        except OSError, err:
 1241          tmp.append("error stating out of band helper: %s" % err)
 1242        else:
 1243          if stat.S_ISREG(st.st_mode):
 1244            if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
 1245              tmp.append(None)
 1246            else:
 1247              tmp.append("out of band helper %s is not executable" % path)
 1248          else:
 1249            tmp.append("out of band helper %s is not a file" % path)
 1250 
 1251    if constants.NV_LVLIST in what and vm_capable:
 1252      try:
 1253        val = GetVolumeList(utils.ListVolumeGroups().keys())
 1254      except RPCFail, err:
 1255        val = str(err)
 1256      result[constants.NV_LVLIST] = val
 1257 
 1258    _VerifyInstanceList(what, vm_capable, result, all_hvparams)
 1259 
 1260    if constants.NV_VGLIST in what and vm_capable:
 1261      result[constants.NV_VGLIST] = utils.ListVolumeGroups()
 1262 
 1263    if constants.NV_PVLIST in what and vm_capable:
 1264      check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
 1265      val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
 1266                                         filter_allocatable=False,
 1267                                         include_lvs=check_exclusive_pvs)
 1268      if check_exclusive_pvs:
 1269        result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
 1270        for pvi in val:
 1271          # Avoid sending useless data on the wire
 1272          pvi.lv_list = []
 1273      result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)
 1274 
 1275    if constants.NV_VERSION in what:
 1276      result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
 1277                                      constants.RELEASE_VERSION)
 1278 
 1279    _VerifyNodeInfo(what, vm_capable, result, all_hvparams)
 1280 
 1281    if constants.NV_DRBDVERSION in what and vm_capable:
 1282      try:
 1283        drbd_version = DRBD8.GetProcInfo().GetVersionString()
 1284      except errors.BlockDeviceError, err:
 1285        logging.warning("Can't get DRBD version", exc_info=True)
 1286        drbd_version = str(err)
 1287      result[constants.NV_DRBDVERSION] = drbd_version
 1288 
 1289    if constants.NV_DRBDLIST in what and vm_capable:
 1290      try:
 1291        used_minors = drbd.DRBD8.GetUsedDevs()
 1292      except errors.BlockDeviceError, err:
 1293        logging.warning("Can't get used minors list", exc_info=True)
 1294        used_minors = str(err)
 1295      result[constants.NV_DRBDLIST] = used_minors
 1296 
 1297    if constants.NV_DRBDHELPER in what and vm_capable:
 1298      status = True
 1299      try:
 1300        payload = drbd.DRBD8.GetUsermodeHelper()
 1301      except errors.BlockDeviceError, err:
 1302        logging.error("Can't get DRBD usermode helper: %s", str(err))
 1303        status = False
 1304        payload = str(err)
 1305      result[constants.NV_DRBDHELPER] = (status, payload)
 1306 
 1307    if constants.NV_NODESETUP in what:
 1308      result[constants.NV_NODESETUP] = tmpr = []
 1309      if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
 1310        tmpr.append("The sysfs filesystem doesn't seem to be mounted"
 1311                    " under /sys, missing required directories /sys/block"
 1312                    " and /sys/class/net")
 1313      if (not os.path.isdir("/proc/sys") or
 1314          not os.path.isfile("/proc/sysrq-trigger")):
 1315        tmpr.append("The procfs filesystem doesn't seem to be mounted"
 1316                    " under /proc, missing required directory /proc/sys and"
 1317                    " the file /proc/sysrq-trigger")
 1318 
 1319    if constants.NV_TIME in what:
 1320      result[constants.NV_TIME] = utils.SplitTime(time.time())
 1321 
 1322    if constants.NV_OSLIST in what and vm_capable:
 1323      result[constants.NV_OSLIST] = DiagnoseOS()
 1324 
 1325    if constants.NV_BRIDGES in what and vm_capable:
 1326      result[constants.NV_BRIDGES] = [bridge
 1327                                      for bridge in what[constants.NV_BRIDGES]
 1328                                      if not utils.BridgeExists(bridge)]
 1329 
 1330    if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
 1331      result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
 1332        filestorage.ComputeWrongFileStoragePaths()
 1333 
 1334    if what.get(constants.NV_FILE_STORAGE_PATH):
 1335      pathresult = filestorage.CheckFileStoragePath(
 1336        what[constants.NV_FILE_STORAGE_PATH])
 1337      if pathresult:
 1338        result[constants.NV_FILE_STORAGE_PATH] = pathresult
 1339 
 1340    if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
 1341      pathresult = filestorage.CheckFileStoragePath(
 1342        what[constants.NV_SHARED_FILE_STORAGE_PATH])
 1343      if pathresult:
 1344        result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult
 1345 
 1346    return result
 1347 
 1348 
 1349  def GetCryptoTokens(token_requests):
 1350    """Perform actions on the node's cryptographic tokens.
 1351 
 1352    Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
 1353    for 'ssl'. Action 'get' returns the digest of the public client ssl
 1354    certificate. Action 'create' creates a new client certificate and private key
 1355    and also returns the digest of the certificate. The third element of a
 1356    token request is a dictionary of optional parameters for the actions; so far
 1357    only the filename is supported.
 1358 
 1359    @type token_requests: list of tuples of (string, string, dict), where the
 1360      first string is in constants.CRYPTO_TYPES, the second in
 1361      constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
 1362      to string.
 1363    @param token_requests: list of requests of cryptographic tokens and actions
 1364      to perform on them. The actions come with a dictionary of options.
 1365    @rtype: list of tuples (string, string)
 1366    @return: list of tuples of the token type and the public crypto token
 1367 
 1368    """
 1369    tokens = []
 1370    for (token_type, action, _) in token_requests:
 1371      if token_type not in constants.CRYPTO_TYPES:
 1372        raise errors.ProgrammerError("Token type '%s' not supported." %
 1373                                     token_type)
 1374      if action not in constants.CRYPTO_ACTIONS:
 1375        raise errors.ProgrammerError("Action '%s' is not supported." %
 1376                                     action)
 1377      if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
 1378        tokens.append((token_type,
 1379                       utils.GetCertificateDigest()))
 1380    return tokens
 1381 
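An illustrative request for the SSL certificate digest; the 'get' action
constant is assumed here to be C{constants.CRYPTO_ACTION_GET} (any member of
C{constants.CRYPTO_ACTIONS} passes the check above):

reqs = [(constants.CRYPTO_TYPE_SSL_DIGEST, constants.CRYPTO_ACTION_GET, {})]
tokens = GetCryptoTokens(reqs)
# -> [(constants.CRYPTO_TYPE_SSL_DIGEST, "<hex digest of the client cert>")]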
 1382 
 1383  def EnsureDaemon(daemon_name, run):
 1384    """Ensures the given daemon is running or stopped.
 1385 
 1386    @type daemon_name: string
 1387    @param daemon_name: name of the daemon (e.g., constants.KVMD)
 1388 
 1389    @type run: bool
 1390    @param run: whether to start or stop the daemon
 1391 
 1392    @rtype: bool
 1393    @return: 'True' if daemon successfully started/stopped,
 1394      'False' otherwise
 1395 
 1396    """
 1397    allowed_daemons = [constants.KVMD]
 1398 
 1399    if daemon_name not in allowed_daemons:
 1400      fn = lambda _: False
 1401    elif run:
 1402      fn = utils.EnsureDaemon
 1403    else:
 1404      fn = utils.StopDaemon
 1405 
 1406    return fn(daemon_name)
 1407 
 1408 
 1409  def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
 1410    (_, noded_cert) = \
 1411      utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
 1412    data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert
 1413 
 1414    cluster_name = ssconf_store.GetClusterName()
 1415    data[constants.SSHS_CLUSTER_NAME] = cluster_name
 1416 
 1417 
 1418  def AddNodeSshKey(node_uuid, node_name,
 1419                    potential_master_candidates,
 1420                    to_authorized_keys=False,
 1421                    to_public_keys=False,
 1422                    get_public_keys=False,
 1423                    pub_key_file=pathutils.SSH_PUB_KEYS,
 1424                    ssconf_store=None,
 1425                    noded_cert_file=pathutils.NODED_CERT_FILE,
 1426                    run_cmd_fn=ssh.RunSshCmdWithStdin,
 1427                    ssh_update_debug=False,
 1428                    ssh_update_verbose=False):
 1429    """Distributes a node's public SSH key across the cluster.
 1430 
 1431    Note that this function should only be executed on the master node, which
 1432    then will copy the new node's key to all nodes in the cluster via SSH.
 1433 
 1434    Also note: at least one of the flags C{to_authorized_keys},
 1435    C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
 1436    the function to actually perform any actions.
 1437 
 1438    @type node_uuid: str
 1439    @param node_uuid: the UUID of the node whose key is added
 1440    @type node_name: str
 1441    @param node_name: the name of the node whose key is added
 1442    @type potential_master_candidates: list of str
 1443    @param potential_master_candidates: list of node names of potential master
 1444      candidates; this should match the list of uuids in the public key file
 1445    @type to_authorized_keys: boolean
 1446    @param to_authorized_keys: whether the key should be added to the
 1447      C{authorized_keys} file of all nodes
 1448    @type to_public_keys: boolean
 1449    @param to_public_keys: whether the keys should be added to the public key file
 1450    @type get_public_keys: boolean
 1451    @param get_public_keys: whether the node should add the clusters' public keys
 1452      to its {ganeti_pub_keys} file
 1453 
 1454    """
 1455    node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
 1456                                to_authorized_keys=to_authorized_keys,
 1457                                to_public_keys=to_public_keys,
 1458                                get_public_keys=get_public_keys)]
 1459    return AddNodeSshKeyBulk(node_list,
 1460                             potential_master_candidates,
 1461                             pub_key_file=pub_key_file,
 1462                             ssconf_store=ssconf_store,
 1463                             noded_cert_file=noded_cert_file,
 1464                             run_cmd_fn=run_cmd_fn,
 1465                             ssh_update_debug=ssh_update_debug,
 1466                             ssh_update_verbose=ssh_update_verbose)
 1467 
 1468 
 1469  # Node info named tuple specifically for the use with AddNodeSshKeyBulk
 1470  SshAddNodeInfo = collections.namedtuple(
 1471    "SshAddNodeInfo",
 1472    ["uuid",
 1473     "name",
 1474     "to_authorized_keys",
 1475     "to_public_keys",
 1476     "get_public_keys"])
 1477 
 1478 
 1479  def AddNodeSshKeyBulk(node_list,
 1480                        potential_master_candidates,
 1481                        pub_key_file=pathutils.SSH_PUB_KEYS,
 1482                        ssconf_store=None,
 1483                        noded_cert_file=pathutils.NODED_CERT_FILE,
 1484                        run_cmd_fn=ssh.RunSshCmdWithStdin,
 1485                        ssh_update_debug=False,
 1486                        ssh_update_verbose=False):
 1487    """Distributes a node's public SSH key across the cluster.
 1488 
 1489    Note that this function should only be executed on the master node, which
 1490    then will copy the new node's key to all nodes in the cluster via SSH.
 1491 
 1492    Also note: at least one of the flags C{to_authorized_keys},
 1493    C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
 1494    the function to actually perform any actions.
 1495 
 1496    @type node_list: list of SshAddNodeInfo tuples
 1497    @param node_list: list of tuples containing the necessary node information
 1498      for adding their keys
 1499    @type potential_master_candidates: list of str
 1500    @param potential_master_candidates: list of node names of potential master
 1501      candidates; this should match the list of uuids in the public key file
 1502 
 1503    """
 1504    # whether there are any keys to be added or retrieved at all
 1505    to_authorized_keys = any([node_info.to_authorized_keys for node_info in
 1506                              node_list])
 1507    to_public_keys = any([node_info.to_public_keys for node_info in
 1508                          node_list])
 1509 
 1510    if not ssconf_store:
 1511      ssconf_store = ssconf.SimpleStore()
 1512 
 1513    for node_info in node_list:
 1514      # replacement not necessary for keys that are not supposed to be in the
 1515      # list of public keys
 1516      if not node_info.to_public_keys:
 1517        continue
 1518      # Check and fix sanity of key file
 1519      keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
 1520      keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
 1521 
 1522      if (not keys_by_name or node_info.name not in keys_by_name) \
 1523          and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
 1524        raise errors.SshUpdateError(
 1525          "No keys found for the new node '%s' (UUID %s) in the list of public"
 1526          " SSH keys, neither for the name nor the UUID" %
 1527          (node_info.name, node_info.uuid))
 1528      else:
 1529        if node_info.name in keys_by_name:
 1530          # Replace the name by UUID in the file as the name should only be used
 1531          # temporarily
 1532          ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
 1533                                error_fn=errors.SshUpdateError,
 1534                                key_file=pub_key_file)
 1535 
 1536    # Retrieve updated map of UUIDs to keys
 1537    keys_by_uuid = ssh.QueryPubKeyFile(
 1538      [node_info.uuid for node_info in node_list], key_file=pub_key_file)
 1539 
 1540    # Update the master node's key files
 1541    (auth_key_file, _) = \
 1542      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 1543    for node_info in node_list:
 1544      if node_info.to_authorized_keys:
 1545        ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
 1546 
 1547    base_data = {}
 1548    _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
 1549    cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
 1550 
 1551    ssh_port_map = ssconf_store.GetSshPortMap()
 1552 
 1553    # Update the target nodes themselves
 1554    for node_info in node_list:
 1555      logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
 1556      if node_info.get_public_keys:
 1557        node_data = {}
 1558        _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
 1559        all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
 1560        node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
 1561          (constants.SSHS_OVERRIDE, all_keys)
 1562 
 1563        try:
 1564          backoff = 5  # seconds
 1565          utils.RetryByNumberOfTimes(
 1566            constants.SSHS_MAX_RETRIES, backoff,
 1567            errors.SshUpdateError,
 1568            run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
 1569            ssh_port_map.get(node_info.name), node_data,
 1570            debug=ssh_update_debug, verbose=ssh_update_verbose,
 1571            use_cluster_key=False, ask_key=False, strict_host_check=False)
 1572        except errors.SshUpdateError as e:
 1573          # Clean up the master's public key file if adding key fails
 1574          if node_info.to_public_keys:
 1575            ssh.RemovePublicKey(node_info.uuid)
 1576          raise e
 1577 
 1578    # Update all nodes except master and the target nodes
 1579    keys_by_uuid_auth = ssh.QueryPubKeyFile(
 1580      [node_info.uuid for node_info in node_list
 1581       if node_info.to_authorized_keys],
 1582      key_file=pub_key_file)
 1583    if to_authorized_keys:
 1584      base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
 1585        (constants.SSHS_ADD, keys_by_uuid_auth)
 1586 
 1587    pot_mc_data = base_data.copy()
 1588    keys_by_uuid_pub = ssh.QueryPubKeyFile(
 1589      [node_info.uuid for node_info in node_list
 1590       if node_info.to_public_keys],
 1591      key_file=pub_key_file)
 1592    if to_public_keys:
 1593      pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
 1594        (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
 1595 
 1596    all_nodes = ssconf_store.GetNodeList()
 1597    master_node = ssconf_store.GetMasterNode()
 1598    online_nodes = ssconf_store.GetOnlineNodeList()
 1599 
 1600    node_errors = []
 1601    for node in all_nodes:
 1602      if node == master_node:
 1603        logging.debug("Skipping master node '%s'.", master_node)
 1604        continue
 1605      if node not in online_nodes:
 1606        logging.debug("Skipping offline node '%s'.", node)
 1607        continue
 1608      if node in potential_master_candidates:
 1609        logging.debug("Updating SSH key files of node '%s'.", node)
 1610        try:
 1611          backoff = 5  # seconds
 1612          utils.RetryByNumberOfTimes(
 1613            constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
 1614            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
 1615            ssh_port_map.get(node), pot_mc_data,
 1616            debug=ssh_update_debug, verbose=ssh_update_verbose,
 1617            use_cluster_key=False, ask_key=False, strict_host_check=False)
 1618        except errors.SshUpdateError as last_exception:
 1619          error_msg = ("When adding the key of node '%s', updating SSH key"
 1620                       " files of node '%s' failed after %s retries."
 1621                       " Not trying again. Last error was: %s." %
 1622                       (node, node_info.name, constants.SSHS_MAX_RETRIES,
 1623                        last_exception))
 1624          node_errors.append((node, error_msg))
 1625          # We only log the error and don't throw an exception, because
 1626          # one unreachable node shall not abort the entire procedure.
 1627          logging.error(error_msg)
 1628 
 1629      else:
 1630        if to_authorized_keys:
 1631          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
 1632                     ssh_port_map.get(node), base_data,
 1633                     debug=ssh_update_debug, verbose=ssh_update_verbose,
 1634                     use_cluster_key=False, ask_key=False,
 1635                     strict_host_check=False)
 1636 
 1637    return node_errors
 1638 
1639 1640 # TODO: will be fixed with pending patch series. 1641 # pylint: disable=R0913 1642 -def RemoveNodeSshKey(node_uuid, node_name, 1643 master_candidate_uuids, 1644 potential_master_candidates, 1645 master_uuid=None, 1646 keys_to_remove=None, 1647 from_authorized_keys=False, 1648 from_public_keys=False, 1649 clear_authorized_keys=False, 1650 clear_public_keys=False, 1651 pub_key_file=pathutils.SSH_PUB_KEYS, 1652 ssconf_store=None, 1653 noded_cert_file=pathutils.NODED_CERT_FILE, 1654 readd=False, 1655 run_cmd_fn=ssh.RunSshCmdWithStdin, 1656 ssh_update_debug=False, 1657 ssh_update_verbose=False):
1658 """Removes the node's SSH keys from the key files and distributes those. 1659 1660 Note that at least one of the flags C{from_authorized_keys}, 1661 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys} 1662 has to be set to C{True} for the function to perform any action at all. 1663 Not doing so will trigger an assertion in the function. 1664 1665 @type node_uuid: str 1666 @param node_uuid: UUID of the node whose key is removed 1667 @type node_name: str 1668 @param node_name: name of the node whose key is remove 1669 @type master_candidate_uuids: list of str 1670 @param master_candidate_uuids: list of UUIDs of the current master candidates 1671 @type potential_master_candidates: list of str 1672 @param potential_master_candidates: list of names of potential master 1673 candidates 1674 @type keys_to_remove: dict of str to list of str 1675 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys 1676 to be removed. This list is supposed to be used only if the keys are not 1677 in the public keys file. This is for example the case when removing a 1678 master node's key. 1679 @type from_authorized_keys: boolean 1680 @param from_authorized_keys: whether or not the key should be removed 1681 from the C{authorized_keys} file 1682 @type from_public_keys: boolean 1683 @param from_public_keys: whether or not the key should be remove from 1684 the C{ganeti_pub_keys} file 1685 @type clear_authorized_keys: boolean 1686 @param clear_authorized_keys: whether or not the C{authorized_keys} file 1687 should be cleared on the node whose keys are removed 1688 @type clear_public_keys: boolean 1689 @param clear_public_keys: whether to clear the node's C{ganeti_pub_key} file 1690 @type readd: boolean 1691 @param readd: whether this is called during a readd operation. 1692 @rtype: list of string 1693 @returns: list of feedback messages 1694 1695 """ 1696 node_list = [SshRemoveNodeInfo(uuid=node_uuid, 1697 name=node_name, 1698 from_authorized_keys=from_authorized_keys, 1699 from_public_keys=from_public_keys, 1700 clear_authorized_keys=clear_authorized_keys, 1701 clear_public_keys=clear_public_keys)] 1702 return RemoveNodeSshKeyBulk(node_list, 1703 master_candidate_uuids, 1704 potential_master_candidates, 1705 master_uuid=master_uuid, 1706 keys_to_remove=keys_to_remove, 1707 pub_key_file=pub_key_file, 1708 ssconf_store=ssconf_store, 1709 noded_cert_file=noded_cert_file, 1710 readd=readd, 1711 run_cmd_fn=run_cmd_fn, 1712 ssh_update_debug=ssh_update_debug, 1713 ssh_update_verbose=ssh_update_verbose)
1714 1715 1716 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk 1717 SshRemoveNodeInfo = collections.namedtuple( 1718 "SshRemoveNodeInfo", 1719 ["uuid", 1720 "name", 1721 "from_authorized_keys", 1722 "from_public_keys", 1723 "clear_authorized_keys", 1724 "clear_public_keys"])
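For illustration, a minimal sketch of how a caller might drive the bulk interface (node name and UUID are made up; L{RemoveNodeSshKey} performs exactly this wrapping for the single-node case):

  # Hypothetical example data; in real use these come from the cluster config.
  nodes_to_remove = [
      SshRemoveNodeInfo(uuid="11111111-2222-3333-4444-555555555555",
                        name="node1.example.com",
                        from_authorized_keys=True,
                        from_public_keys=True,
                        clear_authorized_keys=False,
                        clear_public_keys=False),
  ]
  # Per-node failures are reported as (node, message) pairs instead of
  # raising, so one unreachable node does not abort the whole operation:
  # msgs = RemoveNodeSshKeyBulk(nodes_to_remove, master_candidate_uuids,
  #                             potential_master_candidates)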
1725 1726 1727 -def RemoveNodeSshKeyBulk(node_list, 1728 master_candidate_uuids, 1729 potential_master_candidates, 1730 master_uuid=None, 1731 keys_to_remove=None, 1732 pub_key_file=pathutils.SSH_PUB_KEYS, 1733 ssconf_store=None, 1734 noded_cert_file=pathutils.NODED_CERT_FILE, 1735 readd=False, 1736 run_cmd_fn=ssh.RunSshCmdWithStdin, 1737 ssh_update_debug=False, 1738 ssh_update_verbose=False):
1739 """Removes the node's SSH keys from the key files and distributes those. 1740 1741 Note that at least one of the flags C{from_authorized_keys}, 1742 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys} 1743 of at least one node has to be set to C{True} for the function to perform any 1744 action at all. Not doing so will trigger an assertion in the function. 1745 1746 @type node_list: list of C{SshRemoveNodeInfo}. 1747 @param node_list: list of information about nodes whose keys are being removed 1748 @type master_candidate_uuids: list of str 1749 @param master_candidate_uuids: list of UUIDs of the current master candidates 1750 @type potential_master_candidates: list of str 1751 @param potential_master_candidates: list of names of potential master 1752 candidates 1753 @type keys_to_remove: dict of str to list of str 1754 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys 1755 to be removed. This list is supposed to be used only if the keys are not 1756 in the public keys file. This is for example the case when removing a 1757 master node's key. 1758 @type readd: boolean 1759 @param readd: whether this is called during a readd operation. 1760 @rtype: list of string 1761 @returns: list of feedback messages 1762 1763 """ 1764 # Non-disruptive error messages, list of (node, msg) pairs 1765 result_msgs = [] 1766 1767 # whether there are any keys to be added or retrieved at all 1768 from_authorized_keys = any([node_info.from_authorized_keys for node_info in 1769 node_list]) 1770 from_public_keys = any([node_info.from_public_keys for node_info in 1771 node_list]) 1772 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in 1773 node_list]) 1774 clear_public_keys = any([node_info.clear_public_keys for node_info in 1775 node_list]) 1776 1777 # Make sure at least one of these flags is true. 1778 if not (from_authorized_keys or from_public_keys or clear_authorized_keys 1779 or clear_public_keys): 1780 raise errors.SshUpdateError("No removal from any key file was requested.") 1781 1782 if not ssconf_store: 1783 ssconf_store = ssconf.SimpleStore() 1784 1785 master_node = ssconf_store.GetMasterNode() 1786 ssh_port_map = ssconf_store.GetSshPortMap() 1787 1788 all_keys_to_remove = {} 1789 if from_authorized_keys or from_public_keys: 1790 for node_info in node_list: 1791 # Skip nodes that don't actually need any keys to be removed. 1792 if not (node_info.from_authorized_keys or node_info.from_public_keys): 1793 continue 1794 if node_info.name == master_node and not keys_to_remove: 1795 raise errors.SshUpdateError("Cannot remove the master node's keys.") 1796 if keys_to_remove: 1797 keys = keys_to_remove 1798 else: 1799 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file) 1800 if (not keys or node_info.uuid not in keys) and not readd: 1801 raise errors.SshUpdateError("Node '%s' not found in the list of" 1802 " public SSH keys. It seems someone" 1803 " tries to remove a key from outside" 1804 " the cluster!" % node_info.uuid) 1805 # During an upgrade all nodes have the master key. 
In this case we 1806 # should not remove it to avoid accidentally shutting down cluster 1807 # SSH communication 1808 master_keys = None 1809 if master_uuid: 1810 master_keys = ssh.QueryPubKeyFile([master_uuid], 1811 key_file=pub_key_file) 1812 for master_key in master_keys: 1813 if master_key in keys[node_info.uuid]: 1814 keys[node_info.uuid].remove(master_key) 1815 1816 all_keys_to_remove.update(keys) 1817 1818 if all_keys_to_remove: 1819 base_data = {} 1820 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) 1821 cluster_name = base_data[constants.SSHS_CLUSTER_NAME] 1822 1823 if from_authorized_keys: 1824 # UUIDs of nodes that are supposed to be removed from the 1825 # authorized_keys files. 1826 nodes_remove_from_authorized_keys = [ 1827 node_info.uuid for node_info in node_list 1828 if node_info.from_authorized_keys] 1829 keys_to_remove_from_authorized_keys = dict([ 1830 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items() 1831 if uuid in nodes_remove_from_authorized_keys]) 1832 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \ 1833 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys) 1834 (auth_key_file, _) = \ 1835 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, 1836 dircheck=False) 1837 1838 for uuid in nodes_remove_from_authorized_keys: 1839 ssh.RemoveAuthorizedKeys(auth_key_file, 1840 keys_to_remove_from_authorized_keys[uuid]) 1841 1842 pot_mc_data = base_data.copy() 1843 1844 if from_public_keys: 1845 nodes_remove_from_public_keys = [ 1846 node_info.uuid for node_info in node_list 1847 if node_info.from_public_keys] 1848 keys_to_remove_from_public_keys = dict([ 1849 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items() 1850 if uuid in nodes_remove_from_public_keys]) 1851 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ 1852 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys) 1853 1854 all_nodes = ssconf_store.GetNodeList() 1855 online_nodes = ssconf_store.GetOnlineNodeList() 1856 all_nodes_to_remove = [node_info.name for node_info in node_list] 1857 logging.debug("Removing keys of nodes '%s' from all nodes but itself and" 1858 " master.", ", ".join(all_nodes_to_remove)) 1859 for node in all_nodes: 1860 if node == master_node: 1861 logging.debug("Skipping master node '%s'.", master_node) 1862 continue 1863 if node not in online_nodes: 1864 logging.debug("Skipping offline node '%s'.", node) 1865 continue 1866 if node in all_nodes_to_remove: 1867 logging.debug("Skipping node whose key is removed itself '%s'.", node) 1868 continue 1869 ssh_port = ssh_port_map.get(node) 1870 if not ssh_port: 1871 raise errors.OpExecError("No SSH port information available for" 1872 " node '%s', map: %s." % 1873 (node, ssh_port_map)) 1874 error_msg_final = ("When removing the key of node '%s', updating the" 1875 " SSH key files of node '%s' failed. 
Last error" 1876 " was: %s.") 1877 if node in potential_master_candidates: 1878 logging.debug("Updating key setup of potential master candidate node" 1879 " %s.", node) 1880 try: 1881 backoff = 5 # seconds 1882 utils.RetryByNumberOfTimes( 1883 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, 1884 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, 1885 ssh_port, pot_mc_data, 1886 debug=ssh_update_debug, verbose=ssh_update_verbose, 1887 use_cluster_key=False, ask_key=False, strict_host_check=False) 1888 except errors.SshUpdateError as last_exception: 1889 error_msg = error_msg_final % ( 1890 node_info.name, node, last_exception) 1891 result_msgs.append((node, error_msg)) 1892 logging.error(error_msg) 1893 1894 else: 1895 if from_authorized_keys: 1896 logging.debug("Updating key setup of normal node %s.", node) 1897 try: 1898 backoff = 5 # seconds 1899 utils.RetryByNumberOfTimes( 1900 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, 1901 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, 1902 ssh_port, base_data, 1903 debug=ssh_update_debug, verbose=ssh_update_verbose, 1904 use_cluster_key=False, ask_key=False, strict_host_check=False) 1905 except errors.SshUpdateError as last_exception: 1906 error_msg = error_msg_final % ( 1907 node_info.name, node, last_exception) 1908 result_msgs.append((node, error_msg)) 1909 logging.error(error_msg) 1910 1911 for node_info in node_list: 1912 if node_info.clear_authorized_keys or node_info.from_public_keys or \ 1913 node_info.clear_public_keys: 1914 data = {} 1915 _InitSshUpdateData(data, noded_cert_file, ssconf_store) 1916 cluster_name = data[constants.SSHS_CLUSTER_NAME] 1917 ssh_port = ssh_port_map.get(node_info.name) 1918 if not ssh_port: 1919 raise errors.OpExecError("No SSH port information available for" 1920 " node '%s', which is leaving the cluster.") 1921 1922 if node_info.clear_authorized_keys: 1923 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore, 1924 # we have to specify exactly which keys to clear to leave keys untouched 1925 # that were not added by Ganeti. 1926 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids 1927 if uuid != node_info.uuid] 1928 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids, 1929 key_file=pub_key_file) 1930 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \ 1931 (constants.SSHS_REMOVE, candidate_keys) 1932 1933 if node_info.clear_public_keys: 1934 data[constants.SSHS_SSH_PUBLIC_KEYS] = \ 1935 (constants.SSHS_CLEAR, {}) 1936 elif node_info.from_public_keys: 1937 # Since clearing the public keys subsumes removing just a single key, 1938 # we only do it if clear_public_keys is 'False'. 
1939 1940 if all_keys_to_remove: 1941 data[constants.SSHS_SSH_PUBLIC_KEYS] = \ 1942 (constants.SSHS_REMOVE, all_keys_to_remove) 1943 1944 # If we have no changes to any keyfile, just return 1945 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or 1946 constants.SSHS_SSH_AUTHORIZED_KEYS in data): 1947 return 1948 1949 logging.debug("Updating SSH key setup of target node '%s'.", 1950 node_info.name) 1951 try: 1952 backoff = 5 # seconds 1953 utils.RetryByNumberOfTimes( 1954 constants.SSHS_MAX_RETRIES, backoff, 1955 errors.SshUpdateError, 1956 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE, 1957 ssh_port, data, 1958 debug=ssh_update_debug, verbose=ssh_update_verbose, 1959 use_cluster_key=False, ask_key=False, strict_host_check=False) 1960 except errors.SshUpdateError as last_exception: 1961 result_msgs.append( 1962 (node_info.name, 1963 ("Removing SSH keys from node '%s' failed." 1964 " This can happen when the node is already unreachable." 1965 " Error: %s" % (node_info.name, last_exception)))) 1966 1967 if all_keys_to_remove and from_public_keys: 1968 for node_uuid in nodes_remove_from_public_keys: 1969 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file) 1970 1971 return result_msgs
1972 # pylint: enable=R0913
1973 1974 1975 -def RemoveSshKeyFromPublicKeyFile(node_name, 1976 pub_key_file=pathutils.SSH_PUB_KEYS, 1977 ssconf_store=None):
1978 """Removes a SSH key from the master's public key file. 1979 1980 This is an operation that is only used to clean up after failed operations 1981 (for example failed hooks before adding a node). To avoid abuse of this 1982 function (and the matching RPC call), we add a safety check to make sure 1983 that only stray keys can be removed that belong to nodes that are not 1984 in the cluster (anymore). 1985 1986 @type node_name: string 1987 @param node_name: the name of the node whose key is removed 1988 1989 """ 1990 if not ssconf_store: 1991 ssconf_store = ssconf.SimpleStore() 1992 1993 node_list = ssconf_store.GetNodeList() 1994 1995 if node_name in node_list: 1996 raise errors.SshUpdateError("Cannot remove key of node '%s'," 1997 " because it still belongs to the cluster." 1998 % node_name) 1999 2000 keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file) 2001 if not keys_by_name or node_name not in keys_by_name: 2002 logging.info("The node '%s' whose key is supposed to be removed does not" 2003 " have an entry in the public key file. Hence, there is" 2004 " nothing left to do.", node_name) 2005 2006 ssh.RemovePublicKey(node_name, key_file=pub_key_file)
2007
2008 2009 -def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits, 2010 ssconf_store=None, 2011 noded_cert_file=pathutils.NODED_CERT_FILE, 2012 run_cmd_fn=ssh.RunSshCmdWithStdin, 2013 suffix="", 2014 ssh_update_debug=False, 2015 ssh_update_verbose=False):
2016 """Generates the root SSH key pair on the node. 2017 2018 @type node_name: str 2019 @param node_name: name of the node whose key is remove 2020 @type ssh_port_map: dict of str to int 2021 @param ssh_port_map: mapping of node names to their SSH port 2022 @type ssh_key_type: One of L{constants.SSHK_ALL} 2023 @param ssh_key_type: the type of SSH key to be generated 2024 @type ssh_key_bits: int 2025 @param ssh_key_bits: the length of the key to be generated 2026 2027 """ 2028 if not ssconf_store: 2029 ssconf_store = ssconf.SimpleStore() 2030 2031 data = {} 2032 _InitSshUpdateData(data, noded_cert_file, ssconf_store) 2033 cluster_name = data[constants.SSHS_CLUSTER_NAME] 2034 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix) 2035 2036 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, 2037 ssh_port_map.get(node_name), data, 2038 debug=ssh_update_debug, verbose=ssh_update_verbose, 2039 use_cluster_key=False, ask_key=False, strict_host_check=False)
2040
2041 2042 -def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2043 master_node_uuids = [node_uuid for (node_uuid, node_name) 2044 in node_uuid_name_map 2045 if node_name == master_node_name] 2046 if len(master_node_uuids) != 1: 2047 raise errors.SshUpdateError("No (unique) master UUID found. Master node" 2048 " name: '%s', Master UUID: '%s'" % 2049 (master_node_name, master_node_uuids)) 2050 return master_node_uuids[0]
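A small sketch of the lookup semantics with made-up data:

  # node_uuid_name_map is an iterable of (uuid, name) pairs, as built by
  # zip(node_uuids, node_names) in RenewSshKeys below.
  node_uuid_name_map = [("uuid-1", "node1.example.com"),
                        ("uuid-2", "master.example.com")]
  assert _GetMasterNodeUUID(node_uuid_name_map,
                            "master.example.com") == "uuid-2"
  # Zero matches or duplicate matches raise errors.SshUpdateError instead.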
2051
2052 2053 -def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2054 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid], 2055 key_file=pub_key_file) 2056 if not old_master_keys_by_uuid: 2057 raise errors.SshUpdateError("No public key of the master node (UUID '%s')" 2058 " found, not generating a new key." 2059 % master_node_uuid) 2060 return old_master_keys_by_uuid
2061
2062 2063 -def RenewSshKeys(node_uuids, node_names, master_candidate_uuids, 2064 potential_master_candidates, old_key_type, new_key_type, 2065 new_key_bits, 2066 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS, 2067 ssconf_store=None, 2068 noded_cert_file=pathutils.NODED_CERT_FILE, 2069 run_cmd_fn=ssh.RunSshCmdWithStdin, 2070 ssh_update_debug=False, 2071 ssh_update_verbose=False):
2072 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys. 2073 2074 @type node_uuids: list of str 2075 @param node_uuids: list of node UUIDs whose keys should be renewed 2076 @type node_names: list of str 2077 @param node_names: list of node names whose keys should be removed. This list 2078 should match the C{node_uuids} parameter 2079 @type master_candidate_uuids: list of str 2080 @param master_candidate_uuids: list of UUIDs of master candidates or 2081 master node 2082 @type old_key_type: One of L{constants.SSHK_ALL} 2083 @param old_key_type: the type of SSH key already present on nodes 2084 @type new_key_type: One of L{constants.SSHK_ALL} 2085 @param new_key_type: the type of SSH key to be generated 2086 @type new_key_bits: int 2087 @param new_key_bits: the length of the key to be generated 2088 @type ganeti_pub_keys_file: str 2089 @param ganeti_pub_keys_file: file path of the the public key file 2090 @type noded_cert_file: str 2091 @param noded_cert_file: path of the noded SSL certificate file 2092 @type run_cmd_fn: function 2093 @param run_cmd_fn: function to run commands on remote nodes via SSH 2094 @raises ProgrammerError: if node_uuids and node_names don't match; 2095 SshUpdateError if a node's key is missing from the public key file, 2096 if a node's new SSH key could not be fetched from it, if there is 2097 none or more than one entry in the public key list for the master 2098 node. 2099 2100 """ 2101 if not ssconf_store: 2102 ssconf_store = ssconf.SimpleStore() 2103 cluster_name = ssconf_store.GetClusterName() 2104 2105 if not len(node_uuids) == len(node_names): 2106 raise errors.ProgrammerError("List of nodes UUIDs and node names" 2107 " does not match in length.") 2108 2109 old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type) 2110 new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type) 2111 old_master_key = ssh.ReadLocalSshPubKeys([old_key_type]) 2112 2113 node_uuid_name_map = zip(node_uuids, node_names) 2114 2115 master_node_name = ssconf_store.GetMasterNode() 2116 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name) 2117 ssh_port_map = ssconf_store.GetSshPortMap() 2118 # List of all node errors that happened, but which did not abort the 2119 # procedure as a whole. It is important that this is a list to have a 2120 # somewhat chronological history of events. 2121 all_node_errors = [] 2122 2123 # process non-master nodes 2124 2125 # keys to add in bulk at the end 2126 node_keys_to_add = [] 2127 2128 # list of all nodes 2129 node_list = [] 2130 2131 # list of keys to be removed before generating new keys 2132 node_info_to_remove = [] 2133 2134 for node_uuid, node_name in node_uuid_name_map: 2135 if node_name == master_node_name: 2136 continue 2137 master_candidate = node_uuid in master_candidate_uuids 2138 potential_master_candidate = node_name in potential_master_candidates 2139 node_list.append((node_uuid, node_name, master_candidate, 2140 potential_master_candidate)) 2141 2142 if master_candidate: 2143 logging.debug("Fetching old SSH key from node '%s'.", node_name) 2144 old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile, 2145 node_name, cluster_name, 2146 ssh_port_map[node_name], 2147 False, # ask_key 2148 False) # key_check 2149 if old_pub_key != old_master_key: 2150 # If we are already in a multi-key setup (that is past Ganeti 2.12), 2151 # we can safely remove the old key of the node. 
Otherwise, we cannot 2152 # remove that node's key, because it is also the master node's key 2153 # and that would terminate all communication from the master to the 2154 # node. 2155 node_info_to_remove.append(SshRemoveNodeInfo( 2156 uuid=node_uuid, 2157 name=node_name, 2158 from_authorized_keys=master_candidate, 2159 from_public_keys=False, 2160 clear_authorized_keys=False, 2161 clear_public_keys=False)) 2162 else: 2163 logging.debug("Old key of node '%s' is the same as the current master" 2164 " key. Not deleting that key on the node.", node_name) 2165 2166 logging.debug("Removing old SSH keys of all master candidates.") 2167 if node_info_to_remove: 2168 node_errors = RemoveNodeSshKeyBulk( 2169 node_info_to_remove, 2170 master_candidate_uuids, 2171 potential_master_candidates, 2172 master_uuid=master_node_uuid, 2173 pub_key_file=ganeti_pub_keys_file, 2174 ssconf_store=ssconf_store, 2175 noded_cert_file=noded_cert_file, 2176 run_cmd_fn=run_cmd_fn, 2177 ssh_update_debug=ssh_update_debug, 2178 ssh_update_verbose=ssh_update_verbose) 2179 if node_errors: 2180 all_node_errors = all_node_errors + node_errors 2181 2182 for (node_uuid, node_name, master_candidate, potential_master_candidate) \ 2183 in node_list: 2184 2185 logging.debug("Generating new SSH key for node '%s'.", node_name) 2186 _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits, 2187 ssconf_store=ssconf_store, 2188 noded_cert_file=noded_cert_file, 2189 run_cmd_fn=run_cmd_fn, 2190 ssh_update_verbose=ssh_update_verbose, 2191 ssh_update_debug=ssh_update_debug) 2192 2193 try: 2194 logging.debug("Fetching newly created SSH key from node '%s'.", node_name) 2195 pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile, 2196 node_name, cluster_name, 2197 ssh_port_map[node_name], 2198 False, # ask_key 2199 False) # key_check 2200 except: 2201 raise errors.SshUpdateError("Could not fetch key of node %s" 2202 " (UUID %s)" % (node_name, node_uuid)) 2203 2204 if potential_master_candidate: 2205 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file) 2206 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file) 2207 2208 node_info = SshAddNodeInfo(name=node_name, 2209 uuid=node_uuid, 2210 to_authorized_keys=master_candidate, 2211 to_public_keys=potential_master_candidate, 2212 get_public_keys=True) 2213 node_keys_to_add.append(node_info) 2214 2215 node_errors = AddNodeSshKeyBulk( 2216 node_keys_to_add, potential_master_candidates, 2217 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store, 2218 noded_cert_file=noded_cert_file, 2219 run_cmd_fn=run_cmd_fn, 2220 ssh_update_debug=ssh_update_debug, 2221 ssh_update_verbose=ssh_update_verbose) 2222 if node_errors: 2223 all_node_errors = all_node_errors + node_errors 2224 2225 # Renewing the master node's key 2226 2227 # Preserve the old keys for now 2228 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, 2229 ganeti_pub_keys_file) 2230 2231 # Generate a new master key with a suffix, don't touch the old one for now 2232 logging.debug("Generate new ssh key of master.") 2233 _GenerateNodeSshKey(master_node_name, ssh_port_map, 2234 new_key_type, new_key_bits, 2235 ssconf_store=ssconf_store, 2236 noded_cert_file=noded_cert_file, 2237 run_cmd_fn=run_cmd_fn, 2238 suffix=constants.SSHS_MASTER_SUFFIX, 2239 ssh_update_debug=ssh_update_debug, 2240 ssh_update_verbose=ssh_update_verbose) 2241 # Read newly created master key 2242 new_master_keys = ssh.ReadLocalSshPubKeys( 2243 [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX) 2244 2245 # Replace master key in the 
master node's public key file
2246  ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2247  for pub_key in new_master_keys:
2248    ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2249 
2250  # Add the new master key to all nodes' public and authorized keys
2251  logging.debug("Add new master key to all nodes.")
2252  node_errors = AddNodeSshKey(
2253      master_node_uuid, master_node_name, potential_master_candidates,
2254      to_authorized_keys=True, to_public_keys=True,
2255      get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2256      ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2257      run_cmd_fn=run_cmd_fn,
2258      ssh_update_debug=ssh_update_debug,
2259      ssh_update_verbose=ssh_update_verbose)
2260  if node_errors:
2261    all_node_errors = all_node_errors + node_errors
2262 
2263  # Remove the old key file and rename the new key to the non-temporary filename
2264  ssh.ReplaceSshKeys(new_key_type, new_key_type,
2265                     src_key_suffix=constants.SSHS_MASTER_SUFFIX)
2266 
2267  # Remove the old key from the master's own authorized keys
2268  (auth_key_file, _) = \
2269    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2270  ssh.RemoveAuthorizedKeys(auth_key_file,
2271                           old_master_keys_by_uuid[master_node_uuid])
2272 
2273  # Remove the old key from all nodes' authorized keys files
2274  logging.debug("Remove the old master key from all nodes.")
2275  node_errors = RemoveNodeSshKey(
2276      master_node_uuid, master_node_name, master_candidate_uuids,
2277      potential_master_candidates,
2278      keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2279      from_public_keys=False, clear_authorized_keys=False,
2280      clear_public_keys=False,
2281      pub_key_file=ganeti_pub_keys_file,
2282      ssconf_store=ssconf_store,
2283      noded_cert_file=noded_cert_file,
2284      run_cmd_fn=run_cmd_fn,
2285      ssh_update_debug=ssh_update_debug,
2286      ssh_update_verbose=ssh_update_verbose)
2287  if node_errors:
2288    all_node_errors = all_node_errors + node_errors
2289 
2290  return all_node_errors
2291
2292 2293 -def GetBlockDevSizes(devices):
2294 """Return the size of the given block devices 2295 2296 @type devices: list 2297 @param devices: list of block device nodes to query 2298 @rtype: dict 2299 @return: 2300 dictionary of all block devices under /dev (key). The value is their 2301 size in MiB. 2302 2303 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124} 2304 2305 """ 2306 DEV_PREFIX = "/dev/" 2307 blockdevs = {} 2308 2309 for devpath in devices: 2310 if not utils.IsBelowDir(DEV_PREFIX, devpath): 2311 continue 2312 2313 try: 2314 st = os.stat(devpath) 2315 except EnvironmentError, err: 2316 logging.warning("Error stat()'ing device %s: %s", devpath, str(err)) 2317 continue 2318 2319 if stat.S_ISBLK(st.st_mode): 2320 result = utils.RunCmd(["blockdev", "--getsize64", devpath]) 2321 if result.failed: 2322 # We don't want to fail, just do not list this device as available 2323 logging.warning("Cannot get size for block device %s", devpath) 2324 continue 2325 2326 size = int(result.stdout) / (1024 * 1024) 2327 blockdevs[devpath] = size 2328 return blockdevs
2329
2330 2331 -def GetVolumeList(vg_names):
2332 """Compute list of logical volumes and their size. 2333 2334 @type vg_names: list 2335 @param vg_names: the volume groups whose LVs we should list, or 2336 empty for all volume groups 2337 @rtype: dict 2338 @return: 2339 dictionary of all partions (key) with value being a tuple of 2340 their size (in MiB), inactive and online status:: 2341 2342 {'xenvg/test1': ('20.06', True, True)} 2343 2344 in case of errors, a string is returned with the error 2345 details. 2346 2347 """ 2348 lvs = {} 2349 sep = "|" 2350 if not vg_names: 2351 vg_names = [] 2352 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix", 2353 "--separator=%s" % sep, 2354 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names) 2355 if result.failed: 2356 _Fail("Failed to list logical volumes, lvs output: %s", result.output) 2357 2358 for line in result.stdout.splitlines(): 2359 line = line.strip() 2360 match = _LVSLINE_REGEX.match(line) 2361 if not match: 2362 logging.error("Invalid line returned from lvs output: '%s'", line) 2363 continue 2364 vg_name, name, size, attr = match.groups() 2365 inactive = attr[4] == "-" 2366 online = attr[5] == "o" 2367 virtual = attr[0] == "v" 2368 if virtual: 2369 # we don't want to report such volumes as existing, since they 2370 # don't really hold data 2371 continue 2372 lvs[vg_name + "/" + name] = (size, inactive, online) 2373 2374 return lvs
2375
2376 2377 -def ListVolumeGroups():
2378 """List the volume groups and their size. 2379 2380 @rtype: dict 2381 @return: dictionary with keys volume name and values the 2382 size of the volume 2383 2384 """ 2385 return utils.ListVolumeGroups()
2386
2387 2388 -def NodeVolumes():
2389 """List all volumes on this node. 2390 2391 @rtype: list 2392 @return: 2393 A list of dictionaries, each having four keys: 2394 - name: the logical volume name, 2395 - size: the size of the logical volume 2396 - dev: the physical device on which the LV lives 2397 - vg: the volume group to which it belongs 2398 2399 In case of errors, we return an empty list and log the 2400 error. 2401 2402 Note that since a logical volume can live on multiple physical 2403 volumes, the resulting list might include a logical volume 2404 multiple times. 2405 2406 """ 2407 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix", 2408 "--separator=|", 2409 "--options=lv_name,lv_size,devices,vg_name"]) 2410 if result.failed: 2411 _Fail("Failed to list logical volumes, lvs output: %s", 2412 result.output) 2413 2414 def parse_dev(dev): 2415 return dev.split("(")[0]
2416 2417 def handle_dev(dev): 2418 return [parse_dev(x) for x in dev.split(",")] 2419 2420 def map_line(line): 2421 line = [v.strip() for v in line] 2422 return [{"name": line[0], "size": line[1], 2423 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])] 2424 2425 all_devs = [] 2426 for line in result.stdout.splitlines(): 2427 if line.count("|") >= 3: 2428 all_devs.extend(map_line(line.split("|"))) 2429 else: 2430 logging.warning("Strange line in the output from lvs: '%s'", line) 2431 return all_devs 2432
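A standalone sketch of the same parsing on one made-up line of lvs output; an LV spanning two physical volumes yields one entry per device:

  line = " lv0|1024.00|/dev/sda1(0),/dev/sdb1(0)|xenvg "
  fields = [v.strip() for v in line.split("|")]
  devs = [d.split("(")[0] for d in fields[2].split(",")]
  volumes = [{"name": fields[0], "size": fields[1], "dev": d, "vg": fields[3]}
             for d in devs]
  assert [v["dev"] for v in volumes] == ["/dev/sda1", "/dev/sdb1"]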
2433 2434 -def BridgesExist(bridges_list):
2435 """Check if a list of bridges exist on the current node. 2436 2437 @rtype: boolean 2438 @return: C{True} if all of them exist, C{False} otherwise 2439 2440 """ 2441 missing = [] 2442 for bridge in bridges_list: 2443 if not utils.BridgeExists(bridge): 2444 missing.append(bridge) 2445 2446 if missing: 2447 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2448
2449 2450 -def GetInstanceListForHypervisor(hname, hvparams=None, 2451 get_hv_fn=hypervisor.GetHypervisor):
2452 """Provides a list of instances of the given hypervisor. 2453 2454 @type hname: string 2455 @param hname: name of the hypervisor 2456 @type hvparams: dict of strings 2457 @param hvparams: hypervisor parameters for the given hypervisor 2458 @type get_hv_fn: function 2459 @param get_hv_fn: function that returns a hypervisor for the given hypervisor 2460 name; optional parameter to increase testability 2461 2462 @rtype: list 2463 @return: a list of all running instances on the current node 2464 - instance1.example.com 2465 - instance2.example.com 2466 2467 """ 2468 try: 2469 return get_hv_fn(hname).ListInstances(hvparams=hvparams) 2470 except errors.HypervisorError, err: 2471 _Fail("Error enumerating instances (hypervisor %s): %s", 2472 hname, err, exc=True)
2473
2474 2475 -def GetInstanceList(hypervisor_list, all_hvparams=None, 2476 get_hv_fn=hypervisor.GetHypervisor):
2477 """Provides a list of instances. 2478 2479 @type hypervisor_list: list 2480 @param hypervisor_list: the list of hypervisors to query information 2481 @type all_hvparams: dict of dict of strings 2482 @param all_hvparams: a dictionary mapping hypervisor types to respective 2483 cluster-wide hypervisor parameters 2484 @type get_hv_fn: function 2485 @param get_hv_fn: function that returns a hypervisor for the given hypervisor 2486 name; optional parameter to increase testability 2487 2488 @rtype: list 2489 @return: a list of all running instances on the current node 2490 - instance1.example.com 2491 - instance2.example.com 2492 2493 """ 2494 results = [] 2495 for hname in hypervisor_list: 2496 hvparams = all_hvparams[hname] 2497 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams, 2498 get_hv_fn=get_hv_fn)) 2499 return results
2500
2501 2502 -def GetInstanceInfo(instance, hname, hvparams=None):
2503 """Gives back the information about an instance as a dictionary. 2504 2505 @type instance: string 2506 @param instance: the instance name 2507 @type hname: string 2508 @param hname: the hypervisor type of the instance 2509 @type hvparams: dict of strings 2510 @param hvparams: the instance's hvparams 2511 2512 @rtype: dict 2513 @return: dictionary with the following keys: 2514 - memory: memory size of instance (int) 2515 - state: state of instance (HvInstanceState) 2516 - time: cpu time of instance (float) 2517 - vcpus: the number of vcpus (int) 2518 2519 """ 2520 output = {} 2521 2522 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance, 2523 hvparams=hvparams) 2524 if iinfo is not None: 2525 output["memory"] = iinfo[2] 2526 output["vcpus"] = iinfo[3] 2527 output["state"] = iinfo[4] 2528 output["time"] = iinfo[5] 2529 2530 return output
2531
2532 2533 -def GetInstanceMigratable(instance):
2534 """Computes whether an instance can be migrated. 2535 2536 @type instance: L{objects.Instance} 2537 @param instance: object representing the instance to be checked. 2538 2539 @rtype: tuple 2540 @return: tuple of (result, description) where: 2541 - result: whether the instance can be migrated or not 2542 - description: a description of the issue, if relevant 2543 2544 """ 2545 hyper = hypervisor.GetHypervisor(instance.hypervisor) 2546 iname = instance.name 2547 if iname not in hyper.ListInstances(hvparams=instance.hvparams): 2548 _Fail("Instance %s is not running", iname)
2549
2550 2551 -def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2552 """Gather data about all instances. 2553 2554 This is the equivalent of L{GetInstanceInfo}, except that it 2555 computes data for all instances at once, thus being faster if one 2556 needs data about more than one instance. 2557 2558 @type hypervisor_list: list 2559 @param hypervisor_list: list of hypervisors to query for instance data 2560 @type all_hvparams: dict of dict of strings 2561 @param all_hvparams: mapping of hypervisor names to hvparams 2562 2563 @rtype: dict 2564 @return: dictionary of instance: data, with data having the following keys: 2565 - memory: memory size of instance (int) 2566 - state: xen state of instance (string) 2567 - time: cpu time of instance (float) 2568 - vcpus: the number of vcpus 2569 2570 """ 2571 output = {} 2572 for hname in hypervisor_list: 2573 hvparams = all_hvparams[hname] 2574 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams) 2575 if iinfo: 2576 for name, _, memory, vcpus, state, times in iinfo: 2577 value = { 2578 "memory": memory, 2579 "vcpus": vcpus, 2580 "state": state, 2581 "time": times, 2582 } 2583 if name in output: 2584 # we only check static parameters, like memory and vcpus, 2585 # and not state and time which can change between the 2586 # invocations of the different hypervisors 2587 for key in "memory", "vcpus": 2588 if value[key] != output[name][key]: 2589 _Fail("Instance %s is running twice" 2590 " with different parameters", name) 2591 output[name] = value 2592 2593 return output
2594
2595 2596 -def GetInstanceConsoleInfo(instance_param_dict, 2597 get_hv_fn=hypervisor.GetHypervisor):
2598 """Gather data about the console access of a set of instances of this node. 2599 2600 This function assumes that the caller already knows which instances are on 2601 this node, by calling a function such as L{GetAllInstancesInfo} or 2602 L{GetInstanceList}. 2603 2604 For every instance, a large amount of configuration data needs to be 2605 provided to the hypervisor interface in order to receive the console 2606 information. Whether this could or should be cut down can be discussed. 2607 The information is provided in a dictionary indexed by instance name, 2608 allowing any number of instance queries to be done. 2609 2610 @type instance_param_dict: dict of string to tuple of dictionaries, where the 2611 dictionaries represent: L{objects.Instance}, L{objects.Node}, 2612 L{objects.NodeGroup}, HvParams, BeParams 2613 @param instance_param_dict: mapping of instance name to parameters necessary 2614 for console information retrieval 2615 2616 @rtype: dict 2617 @return: dictionary of instance: data, with data having the following keys: 2618 - instance: instance name 2619 - kind: console kind 2620 - message: used with kind == CONS_MESSAGE, indicates console to be 2621 unavailable, supplies error message 2622 - host: host to connect to 2623 - port: port to use 2624 - user: user for login 2625 - command: the command, broken into parts as an array 2626 - display: unknown, potentially unused? 2627 2628 """ 2629 2630 output = {} 2631 for inst_name in instance_param_dict: 2632 instance = instance_param_dict[inst_name]["instance"] 2633 pnode = instance_param_dict[inst_name]["node"] 2634 group = instance_param_dict[inst_name]["group"] 2635 hvparams = instance_param_dict[inst_name]["hvParams"] 2636 beparams = instance_param_dict[inst_name]["beParams"] 2637 2638 instance = objects.Instance.FromDict(instance) 2639 pnode = objects.Node.FromDict(pnode) 2640 group = objects.NodeGroup.FromDict(group) 2641 2642 h = get_hv_fn(instance.hypervisor) 2643 output[inst_name] = h.GetInstanceConsole(instance, pnode, group, 2644 hvparams, beparams).ToDict() 2645 2646 return output
2647
2648 2649 -def _InstanceLogName(kind, os_name, instance, component):
2650 """Compute the OS log filename for a given instance and operation. 2651 2652 The instance name and os name are passed in as strings since not all 2653 operations have these as part of an instance object. 2654 2655 @type kind: string 2656 @param kind: the operation type (e.g. add, import, etc.) 2657 @type os_name: string 2658 @param os_name: the os name 2659 @type instance: string 2660 @param instance: the name of the instance being imported/added/etc. 2661 @type component: string or None 2662 @param component: the name of the component of the instance being 2663 transferred 2664 2665 """ 2666 # TODO: Use tempfile.mkstemp to create unique filename 2667 if component: 2668 assert "/" not in component 2669 c_msg = "-%s" % component 2670 else: 2671 c_msg = "" 2672 base = ("%s-%s-%s%s-%s.log" % 2673 (kind, os_name, instance, c_msg, utils.TimestampForFilename())) 2674 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2675
2676 2677 -def InstanceOsAdd(instance, reinstall, debug):
2678 """Add an OS to an instance. 2679 2680 @type instance: L{objects.Instance} 2681 @param instance: Instance whose OS is to be installed 2682 @type reinstall: boolean 2683 @param reinstall: whether this is an instance reinstall 2684 @type debug: integer 2685 @param debug: debug level, passed to the OS scripts 2686 @rtype: None 2687 2688 """ 2689 inst_os = OSFromDisk(instance.os) 2690 2691 create_env = OSEnvironment(instance, inst_os, debug) 2692 if reinstall: 2693 create_env["INSTANCE_REINSTALL"] = "1" 2694 2695 logfile = _InstanceLogName("add", instance.os, instance.name, None) 2696 2697 result = utils.RunCmd([inst_os.create_script], env=create_env, 2698 cwd=inst_os.path, output=logfile, reset_env=True) 2699 if result.failed: 2700 logging.error("os create command '%s' returned error: %s, logfile: %s," 2701 " output: %s", result.cmd, result.fail_reason, logfile, 2702 result.output) 2703 lines = [utils.SafeEncode(val) 2704 for val in utils.TailFile(logfile, lines=20)] 2705 _Fail("OS create script failed (%s), last lines in the" 2706 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2707
2708 2709 -def RunRenameInstance(instance, old_name, debug):
2710 """Run the OS rename script for an instance. 2711 2712 @type instance: L{objects.Instance} 2713 @param instance: Instance whose OS is to be installed 2714 @type old_name: string 2715 @param old_name: previous instance name 2716 @type debug: integer 2717 @param debug: debug level, passed to the OS scripts 2718 @rtype: boolean 2719 @return: the success of the operation 2720 2721 """ 2722 inst_os = OSFromDisk(instance.os) 2723 2724 rename_env = OSEnvironment(instance, inst_os, debug) 2725 rename_env["OLD_INSTANCE_NAME"] = old_name 2726 2727 logfile = _InstanceLogName("rename", instance.os, 2728 "%s-%s" % (old_name, instance.name), None) 2729 2730 result = utils.RunCmd([inst_os.rename_script], env=rename_env, 2731 cwd=inst_os.path, output=logfile, reset_env=True) 2732 2733 if result.failed: 2734 logging.error("os create command '%s' returned error: %s output: %s", 2735 result.cmd, result.fail_reason, result.output) 2736 lines = [utils.SafeEncode(val) 2737 for val in utils.TailFile(logfile, lines=20)] 2738 _Fail("OS rename script failed (%s), last lines in the" 2739 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2740
2741 2742 -def _GetBlockDevSymlinkPath(instance_name, idx=None, uuid=None, _dir=None):
2743 """Returns symlink path for block device. 2744 2745 """ 2746 if _dir is None: 2747 _dir = pathutils.DISK_LINKS_DIR 2748 2749 assert idx is not None or uuid is not None 2750 2751 # Using the idx is deprecated. Use the uuid instead if it is available. 2752 if uuid: 2753 ident = uuid 2754 else: 2755 ident = idx 2756 2757 return utils.PathJoin(_dir, 2758 ("%s%s%s" % 2759 (instance_name, constants.DISK_SEPARATOR, ident)))
2760
2761 2762 -def _SymlinkBlockDev(instance_name, device_path, idx=None, uuid=None):
2763 """Set up symlinks to a instance's block device. 2764 2765 This is an auxiliary function run when an instance is start (on the primary 2766 node) or when an instance is migrated (on the target node). 2767 2768 2769 @param instance_name: the name of the target instance 2770 @param device_path: path of the physical block device, on the node 2771 @param idx: the disk index 2772 @param uuid: the disk uuid 2773 @return: absolute path to the disk's symlink 2774 2775 """ 2776 # In case we have only a userspace access URI, device_path is None 2777 if not device_path: 2778 return None 2779 2780 link_name = _GetBlockDevSymlinkPath(instance_name, idx, uuid) 2781 2782 try: 2783 os.symlink(device_path, link_name) 2784 except OSError, err: 2785 if err.errno == errno.EEXIST: 2786 if (not os.path.islink(link_name) or 2787 os.readlink(link_name) != device_path): 2788 os.remove(link_name) 2789 os.symlink(device_path, link_name) 2790 else: 2791 raise 2792 2793 return link_name
2794 
2795 
2796 -def _RemoveBlockDevLinks(instance_name, disks):
2797  """Remove the block device symlinks belonging to the given instance.
2798 
2799  """
2800  def _remove_symlink(link_name):
2801    if os.path.islink(link_name):
2802      try:
2803        os.remove(link_name)
2804      except OSError:
2805        logging.exception("Error while removing symlink %s", link_name)
2806 
2807  for idx, disk in enumerate(disks):
2808    link_name = _GetBlockDevSymlinkPath(instance_name, uuid=disk.uuid)
2809    _remove_symlink(link_name)
2810    # Remove also the deprecated symlink (if any)
2811    link_name = _GetBlockDevSymlinkPath(instance_name, idx=idx)
2812    _remove_symlink(link_name)
2813 
2814 2815 -def _CalculateDeviceURI(instance, disk, device):
2816 """Get the URI for the device. 2817 2818 @type instance: L{objects.Instance} 2819 @param instance: the instance which disk belongs to 2820 @type disk: L{objects.Disk} 2821 @param disk: the target disk object 2822 @type device: L{bdev.BlockDev} 2823 @param device: the corresponding BlockDevice 2824 @rtype: string 2825 @return: the device uri if any else None 2826 2827 """ 2828 access_mode = disk.params.get(constants.LDP_ACCESS, 2829 constants.DISK_KERNELSPACE) 2830 if access_mode == constants.DISK_USERSPACE: 2831 # This can raise errors.BlockDeviceError 2832 return device.GetUserspaceAccessUri(instance.hypervisor) 2833 else: 2834 return None
2835
2836 2837 -def _GatherAndLinkBlockDevs(instance):
2838 """Set up an instance's block device(s). 2839 2840 This is run on the primary node at instance startup. The block 2841 devices must be already assembled. 2842 2843 @type instance: L{objects.Instance} 2844 @param instance: the instance whose disks we should assemble 2845 @rtype: list 2846 @return: list of (disk_object, link_name, drive_uri) 2847 2848 """ 2849 block_devices = [] 2850 for idx, disk in enumerate(instance.disks_info): 2851 device = _RecursiveFindBD(disk) 2852 if device is None: 2853 raise errors.BlockDeviceError("Block device '%s' is not set up." % 2854 str(disk)) 2855 device.Open() 2856 try: 2857 # Create both index-based and uuid-based symlinks 2858 # for backwards compatibility 2859 _SymlinkBlockDev(instance.name, device.dev_path, idx=idx) 2860 link_name = _SymlinkBlockDev(instance.name, device.dev_path, 2861 uuid=disk.uuid) 2862 except OSError, e: 2863 raise errors.BlockDeviceError("Cannot create block device symlink: %s" % 2864 e.strerror) 2865 uri = _CalculateDeviceURI(instance, disk, device) 2866 2867 block_devices.append((disk, link_name, uri)) 2868 2869 return block_devices
2870
2871 2872 -def _IsInstanceUserDown(instance_info):
2873 return instance_info and \ 2874 "state" in instance_info and \ 2875 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2876
2877 2878 -def _GetInstanceInfo(instance):
2879 """Helper function L{GetInstanceInfo}""" 2880 return GetInstanceInfo(instance.name, instance.hypervisor, 2881 hvparams=instance.hvparams)
2882
2883 2884 -def StartInstance(instance, startup_paused, reason, store_reason=True):
2885 """Start an instance. 2886 2887 @type instance: L{objects.Instance} 2888 @param instance: the instance object 2889 @type startup_paused: bool 2890 @param instance: pause instance at startup? 2891 @type reason: list of reasons 2892 @param reason: the reason trail for this startup 2893 @type store_reason: boolean 2894 @param store_reason: whether to store the shutdown reason trail on file 2895 @rtype: None 2896 2897 """ 2898 instance_info = _GetInstanceInfo(instance) 2899 2900 if instance_info and not _IsInstanceUserDown(instance_info): 2901 logging.info("Instance '%s' already running, not starting", instance.name) 2902 return 2903 2904 try: 2905 block_devices = _GatherAndLinkBlockDevs(instance) 2906 hyper = hypervisor.GetHypervisor(instance.hypervisor) 2907 hyper.StartInstance(instance, block_devices, startup_paused) 2908 if store_reason: 2909 _StoreInstReasonTrail(instance.name, reason) 2910 except errors.BlockDeviceError, err: 2911 _Fail("Block device error: %s", err, exc=True) 2912 except errors.HypervisorError, err: 2913 _RemoveBlockDevLinks(instance.name, instance.disks_info) 2914 _Fail("Hypervisor error: %s", err, exc=True)
2915
2916 2917 -def InstanceShutdown(instance, timeout, reason, store_reason=True):
2918 """Shut an instance down. 2919 2920 @note: this functions uses polling with a hardcoded timeout. 2921 2922 @type instance: L{objects.Instance} 2923 @param instance: the instance object 2924 @type timeout: integer 2925 @param timeout: maximum timeout for soft shutdown 2926 @type reason: list of reasons 2927 @param reason: the reason trail for this shutdown 2928 @type store_reason: boolean 2929 @param store_reason: whether to store the shutdown reason trail on file 2930 @rtype: None 2931 2932 """ 2933 hyper = hypervisor.GetHypervisor(instance.hypervisor) 2934 2935 if not _GetInstanceInfo(instance): 2936 logging.info("Instance '%s' not running, doing nothing", instance.name) 2937 return 2938 2939 class _TryShutdown(object): 2940 def __init__(self): 2941 self.tried_once = False
2942 2943 def __call__(self): 2944 if not _GetInstanceInfo(instance): 2945 return 2946 2947 try: 2948 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout) 2949 if store_reason: 2950 _StoreInstReasonTrail(instance.name, reason) 2951 except errors.HypervisorError, err: 2952 # if the instance is no longer existing, consider this a 2953 # success and go to cleanup 2954 if not _GetInstanceInfo(instance): 2955 return 2956 2957 _Fail("Failed to stop instance '%s': %s", instance.name, err) 2958 2959 self.tried_once = True 2960 2961 raise utils.RetryAgain() 2962 2963 try: 2964 utils.Retry(_TryShutdown(), 5, timeout) 2965 except utils.RetryTimeout: 2966 # the shutdown did not succeed 2967 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name) 2968 2969 try: 2970 hyper.StopInstance(instance, force=True) 2971 except errors.HypervisorError, err: 2972 # only raise an error if the instance still exists, otherwise 2973 # the error could simply be "instance ... unknown"! 2974 if _GetInstanceInfo(instance): 2975 _Fail("Failed to force stop instance '%s': %s", instance.name, err) 2976 2977 time.sleep(1) 2978 2979 if _GetInstanceInfo(instance): 2980 _Fail("Could not shutdown instance '%s' even by destroy", instance.name) 2981 2982 try: 2983 hyper.CleanupInstance(instance.name) 2984 except errors.HypervisorError, err: 2985 logging.warning("Failed to execute post-shutdown cleanup step: %s", err) 2986 2987 _RemoveBlockDevLinks(instance.name, instance.disks_info) 2988
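The _TryShutdown callable above relies on the generic retry protocol of utils.Retry: the callable raises utils.RetryAgain to request another attempt every `delay` seconds until `timeout` elapses, at which point utils.RetryTimeout is raised. A minimal standalone sketch of that protocol (the file path is an example):

  class _WaitForFile(object):
    def __init__(self, path):
      self.path = path

    def __call__(self):
      if not os.path.exists(self.path):
        raise utils.RetryAgain()   # ask utils.Retry to call us again

  try:
    utils.Retry(_WaitForFile("/var/run/example.pid"), 5, 60)
  except utils.RetryTimeout:
    logging.error("File did not appear within 60 seconds")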
2989 2990 -def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2991 """Reboot an instance. 2992 2993 @type instance: L{objects.Instance} 2994 @param instance: the instance object to reboot 2995 @type reboot_type: str 2996 @param reboot_type: the type of reboot, one the following 2997 constants: 2998 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the 2999 instance OS, do not recreate the VM 3000 - L{constants.INSTANCE_REBOOT_HARD}: tear down and 3001 restart the VM (at the hypervisor level) 3002 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is 3003 not accepted here, since that mode is handled differently, in 3004 cmdlib, and translates into full stop and start of the 3005 instance (instead of a call_instance_reboot RPC) 3006 @type shutdown_timeout: integer 3007 @param shutdown_timeout: maximum timeout for soft shutdown 3008 @type reason: list of reasons 3009 @param reason: the reason trail for this reboot 3010 @rtype: None 3011 3012 """ 3013 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown' 3014 # because those functions simply 'return' on error whereas this one 3015 # raises an exception with '_Fail' 3016 if not _GetInstanceInfo(instance): 3017 _Fail("Cannot reboot instance '%s' that is not running", instance.name) 3018 3019 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3020 if reboot_type == constants.INSTANCE_REBOOT_SOFT: 3021 try: 3022 hyper.RebootInstance(instance) 3023 except errors.HypervisorError, err: 3024 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err) 3025 elif reboot_type == constants.INSTANCE_REBOOT_HARD: 3026 try: 3027 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False) 3028 result = StartInstance(instance, False, reason, store_reason=False) 3029 _StoreInstReasonTrail(instance.name, reason) 3030 return result 3031 except errors.HypervisorError, err: 3032 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err) 3033 else: 3034 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3035
3036 3037 -def InstanceBalloonMemory(instance, memory):
3038 """Resize an instance's memory. 3039 3040 @type instance: L{objects.Instance} 3041 @param instance: the instance object 3042 @type memory: int 3043 @param memory: new memory amount in MB 3044 @rtype: None 3045 3046 """ 3047 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3048 running = hyper.ListInstances(hvparams=instance.hvparams) 3049 if instance.name not in running: 3050 logging.info("Instance %s is not running, cannot balloon", instance.name) 3051 return 3052 try: 3053 hyper.BalloonInstanceMemory(instance, memory) 3054 except errors.HypervisorError, err: 3055 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3056
3057 3058 -def MigrationInfo(instance):
3059 """Gather information about an instance to be migrated. 3060 3061 @type instance: L{objects.Instance} 3062 @param instance: the instance definition 3063 3064 """ 3065 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3066 try: 3067 info = hyper.MigrationInfo(instance) 3068 except errors.HypervisorError, err: 3069 _Fail("Failed to fetch migration information: %s", err, exc=True) 3070 return info
3071
3072 3073 -def AcceptInstance(instance, info, target):
3074 """Prepare the node to accept an instance. 3075 3076 @type instance: L{objects.Instance} 3077 @param instance: the instance definition 3078 @type info: string/data (opaque) 3079 @param info: migration information, from the source node 3080 @type target: string 3081 @param target: target host (usually ip), on this node 3082 3083 """ 3084 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3085 try: 3086 hyper.AcceptInstance(instance, info, target) 3087 except errors.HypervisorError, err: 3088 _Fail("Failed to accept instance: %s", err, exc=True)
3089
3090 3091 -def FinalizeMigrationDst(instance, info, success):
3092 """Finalize any preparation to accept an instance. 3093 3094 @type instance: L{objects.Instance} 3095 @param instance: the instance definition 3096 @type info: string/data (opaque) 3097 @param info: migration information, from the source node 3098 @type success: boolean 3099 @param success: whether the migration was a success or a failure 3100 3101 """ 3102 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3103 try: 3104 hyper.FinalizeMigrationDst(instance, info, success) 3105 except errors.HypervisorError, err: 3106 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3107
3108 3109 -def MigrateInstance(cluster_name, instance, target, live):
3110 """Migrates an instance to another node. 3111 3112 @type cluster_name: string 3113 @param cluster_name: name of the cluster 3114 @type instance: L{objects.Instance} 3115 @param instance: the instance definition 3116 @type target: string 3117 @param target: the target node name 3118 @type live: boolean 3119 @param live: whether the migration should be done live or not (the 3120 interpretation of this parameter is left to the hypervisor) 3121 @raise RPCFail: if migration fails for some reason 3122 3123 """ 3124 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3125 3126 try: 3127 hyper.MigrateInstance(cluster_name, instance, target, live) 3128 except errors.HypervisorError, err: 3129 _Fail("Failed to migrate instance: %s", err, exc=True)
3130
3131 3132 -def FinalizeMigrationSource(instance, success, live):
3133 """Finalize the instance migration on the source node. 3134 3135 @type instance: L{objects.Instance} 3136 @param instance: the instance definition of the migrated instance 3137 @type success: bool 3138 @param success: whether the migration succeeded or not 3139 @type live: bool 3140 @param live: whether the user requested a live migration or not 3141 @raise RPCFail: If the execution fails for some reason 3142 3143 """ 3144 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3145 3146 try: 3147 hyper.FinalizeMigrationSource(instance, success, live) 3148 except Exception, err: # pylint: disable=W0703 3149 _Fail("Failed to finalize the migration on the source node: %s", err, 3150 exc=True)
3151
3152 3153 -def GetMigrationStatus(instance):
3154 """Get the migration status 3155 3156 @type instance: L{objects.Instance} 3157 @param instance: the instance that is being migrated 3158 @rtype: L{objects.MigrationStatus} 3159 @return: the status of the current migration (one of 3160 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional 3161 progress info that can be retrieved from the hypervisor 3162 @raise RPCFail: If the migration status cannot be retrieved 3163 3164 """ 3165 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3166 try: 3167 return hyper.GetMigrationStatus(instance) 3168 except Exception, err: # pylint: disable=W0703 3169 _Fail("Failed to get migration status: %s", err, exc=True)
3170
3171 3172 -def HotplugDevice(instance, action, dev_type, device, extra, seq):
3173 """Hotplug a device 3174 3175 Hotplug is currently supported only for KVM Hypervisor. 3176 @type instance: L{objects.Instance} 3177 @param instance: the instance to which we hotplug a device 3178 @type action: string 3179 @param action: the hotplug action to perform 3180 @type dev_type: string 3181 @param dev_type: the device type to hotplug 3182 @type device: either L{objects.NIC} or L{objects.Disk} 3183 @param device: the device object to hotplug 3184 @type extra: tuple 3185 @param extra: extra info used for disk hotplug (disk link, drive uri) 3186 @type seq: int 3187 @param seq: the index of the device from master perspective 3188 @raise RPCFail: in case instance does not have KVM hypervisor 3189 3190 """ 3191 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3192 try: 3193 hyper.VerifyHotplugSupport(instance, action, dev_type) 3194 except errors.HotplugError, err: 3195 _Fail("Hotplug is not supported: %s", err) 3196 3197 if action == constants.HOTPLUG_ACTION_ADD: 3198 fn = hyper.HotAddDevice 3199 elif action == constants.HOTPLUG_ACTION_REMOVE: 3200 fn = hyper.HotDelDevice 3201 elif action == constants.HOTPLUG_ACTION_MODIFY: 3202 fn = hyper.HotModDevice 3203 else: 3204 assert action in constants.HOTPLUG_ALL_ACTIONS 3205 3206 return fn(instance, dev_type, device, extra, seq)
3207
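The if/elif ladder above is effectively a dispatch table; the same selection can be written as a dict lookup, sketched here as a stylistic alternative (the helper name is made up; the constants and hypervisor methods are the ones used above):

      def _PickHotplugFn(hyper, action):
        # Map each hotplug action constant to the hypervisor method that
        # implements it; unknown actions fail loudly instead of falling
        # through to an unbound function.
        handlers = {
          constants.HOTPLUG_ACTION_ADD: hyper.HotAddDevice,
          constants.HOTPLUG_ACTION_REMOVE: hyper.HotDelDevice,
          constants.HOTPLUG_ACTION_MODIFY: hyper.HotModDevice,
        }
        try:
          return handlers[action]
        except KeyError:
          raise errors.HotplugError("Unknown hotplug action '%s'" % action)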
3208  
3209  def HotplugSupported(instance):
3210 """Checks if hotplug is generally supported. 3211 3212 """ 3213 hyper = hypervisor.GetHypervisor(instance.hypervisor) 3214 try: 3215 hyper.HotplugSupported(instance) 3216 except errors.HotplugError, err: 3217 _Fail("Hotplug is not supported: %s", err)
3218
3219  
3220  def ModifyInstanceMetadata(metadata):
3221 """Sends instance data to the metadata daemon. 3222 3223 Uses the Luxi transport layer to communicate with the metadata 3224 daemon configuration server. It starts the metadata daemon if it is 3225 not running. 3226 The daemon must be enabled during at configuration time. 3227 3228 @type metadata: dict 3229 @param metadata: instance metadata obtained by calling 3230 L{objects.Instance.ToDict} on an instance object 3231 3232 """ 3233 if not constants.ENABLE_METAD: 3234 raise errors.ProgrammerError("The metadata deamon is disabled, yet" 3235 " ModifyInstanceMetadata has been called") 3236 3237 if not utils.IsDaemonAlive(constants.METAD): 3238 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD]) 3239 if result.failed: 3240 raise errors.HypervisorError("Failed to start metadata daemon") 3241 3242 with contextlib.closing(metad.Client()) as client: 3243 client.UpdateConfig(metadata)
3244
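The contextlib.closing wrapper above guarantees that the client's close() runs whether or not UpdateConfig raises. A self-contained sketch of the pattern with a stand-in client (FakeClient is invented for illustration; the module itself uses metad.Client()):

      import contextlib

      class FakeClient(object):
        # Stand-in for metad.Client(): any object with a close() method
        # works with contextlib.closing, which calls it on both normal
        # and exceptional exit from the "with" block.
        def UpdateConfig(self, data):
          pass

        def close(self):
          pass

      with contextlib.closing(FakeClient()) as client:
        client.UpdateConfig({"name": "inst1.example.com"})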
3245  
3246  def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
3247 """Creates a block device for an instance. 3248 3249 @type disk: L{objects.Disk} 3250 @param disk: the object describing the disk we should create 3251 @type size: int 3252 @param size: the size of the physical underlying device, in MiB 3253 @type owner: str 3254 @param owner: the name of the instance for which disk is created, 3255 used for device cache data 3256 @type on_primary: boolean 3257 @param on_primary: indicates if it is the primary node or not 3258 @type info: string 3259 @param info: string that will be sent to the physical device 3260 creation, used for example to set (LVM) tags on LVs 3261 @type excl_stor: boolean 3262 @param excl_stor: Whether exclusive_storage is active 3263 3264 @return: the new unique_id of the device (this can sometime be 3265 computed only after creation), or None. On secondary nodes, 3266 it's not required to return anything. 3267 3268 """ 3269 # TODO: remove the obsolete "size" argument 3270 # pylint: disable=W0613 3271 clist = [] 3272 if disk.children: 3273 for child in disk.children: 3274 try: 3275 crdev = _RecursiveAssembleBD(child, owner, on_primary) 3276 except errors.BlockDeviceError, err: 3277 _Fail("Can't assemble device %s: %s", child, err) 3278 if on_primary or disk.AssembleOnSecondary(): 3279 # we need the children open in case the device itself has to 3280 # be assembled 3281 try: 3282 # pylint: disable=E1103 3283 crdev.Open() 3284 except errors.BlockDeviceError, err: 3285 _Fail("Can't make child '%s' read-write: %s", child, err) 3286 clist.append(crdev) 3287 3288 try: 3289 device = bdev.Create(disk, clist, excl_stor) 3290 except errors.BlockDeviceError, err: 3291 _Fail("Can't create block device: %s", err) 3292 3293 if on_primary or disk.AssembleOnSecondary(): 3294 try: 3295 device.Assemble() 3296 except errors.BlockDeviceError, err: 3297 _Fail("Can't assemble device after creation, unusual event: %s", err) 3298 if on_primary or disk.OpenOnSecondary(): 3299 try: 3300 device.Open(force=True) 3301 except errors.BlockDeviceError, err: 3302 _Fail("Can't make device r/w after creation, unusual event: %s", err) 3303 DevCacheManager.UpdateCache(device.dev_path, owner, 3304 on_primary, disk.iv_name) 3305 3306 device.SetInfo(info) 3307 3308 return device.unique_id
3309
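The child-handling loop above follows the usual bottom-up pattern for layered block devices (e.g. DRBD on top of LVM): children are assembled and opened first so the parent can be created on top of them. Reduced to a stand-alone sketch with stand-in callables (none of these names are ganeti APIs):

      def _AssembleTree(node, assemble_fn, open_fn):
        # Depth-first: bring up the children, open them so the parent can
        # stack on top of them, then assemble the parent itself.
        ready = [_AssembleTree(child, assemble_fn, open_fn)
                 for child in getattr(node, "children", None) or []]
        for child in ready:
          open_fn(child)
        return assemble_fn(node, ready)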
3310  
3311  def _DumpDevice(source_path, target_path, offset, size, truncate):
3312 """This function images/wipes the device using a local file. 3313 3314 @type source_path: string 3315 @param source_path: path of the image or data source (e.g., "/dev/zero") 3316 3317 @type target_path: string 3318 @param target_path: path of the device to image/wipe 3319 3320 @type offset: int 3321 @param offset: offset in MiB in the output file 3322 3323 @type size: int 3324 @param size: maximum size in MiB to write (data source might be smaller) 3325 3326 @type truncate: bool 3327 @param truncate: whether the file should be truncated 3328 3329 @return: None 3330 @raise RPCFail: in case of failure 3331 3332 """ 3333 # Internal sizes are always in Mebibytes; if the following "dd" command 3334 # should use a different block size the offset and size given to this 3335 # function must be adjusted accordingly before being passed to "dd". 3336 block_size = constants.DD_BLOCK_SIZE 3337 3338 cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset, 3339 "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path, 3340 "count=%d" % size] 3341 3342 if not truncate: 3343 cmd.append("conv=notrunc") 3344 3345 result = utils.RunCmd(cmd) 3346 3347 if result.failed: 3348 _Fail("Dump command '%s' exited with error: %s; output: %s", result.cmd, 3349 result.fail_reason, result.output)
3350
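For concreteness, assuming constants.DD_CMD is "dd" and the 1 MiB block size mentioned in the comment above, a hypothetical call _DumpDevice("/dev/zero", "/dev/xvdb", 0, 1024, truncate=False) would build roughly this command (the exact spelling of "bs=" depends on constants.DD_BLOCK_SIZE):

      cmd = ["dd", "if=/dev/zero", "seek=0", "bs=1M", "oflag=direct",
             "of=/dev/xvdb", "count=1024", "conv=notrunc"]
      # shell equivalent:
      #   dd if=/dev/zero seek=0 bs=1M oflag=direct of=/dev/xvdb \
      #      count=1024 conv=notrunc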
3351  
3352  def _DownloadAndDumpDevice(source_url, target_path, size):
3353 """This function images a device using a downloaded image file. 3354 3355 @type source_url: string 3356 @param source_url: URL of image to dump to disk 3357 3358 @type target_path: string 3359 @param target_path: path of the device to image 3360 3361 @type size: int 3362 @param size: maximum size in MiB to write (data source might be smaller) 3363 3364 @rtype: NoneType 3365 @return: None 3366 @raise RPCFail: in case of download or write failures 3367 3368 """ 3369 class DDParams(object): 3370 def __init__(self, current_size, total_size): 3371 self.current_size = current_size 3372 self.total_size = total_size 3373 self.image_size_error = False
3374  
3375    def dd_write(ddparams, out):
3376      if ddparams.current_size < ddparams.total_size:
3377        ddparams.current_size += len(out)
3378        target_file.write(out)
3379      else:
3380        ddparams.image_size_error = True
3381        return -1
3382  
3383    target_file = open(target_path, "r+")
3384    ddparams = DDParams(0, 1024 * 1024 * size)
3385  
3386    curl = pycurl.Curl()
3387    curl.setopt(pycurl.VERBOSE, True)
3388    curl.setopt(pycurl.NOSIGNAL, True)
3389    curl.setopt(pycurl.USERAGENT, http.HTTP_GANETI_VERSION)
3390    curl.setopt(pycurl.URL, source_url)
3391    curl.setopt(pycurl.WRITEFUNCTION, lambda out: dd_write(ddparams, out))
3392  
3393    try:
3394      curl.perform()
3395    except pycurl.error:
3396      if ddparams.image_size_error:
3397        _Fail("Disk image larger than the disk")
3398      else:
3399        raise
3400  
3401    target_file.close()
3402  
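The write callback above relies on a libcurl convention: if a WRITEFUNCTION returns a value different from the number of bytes it was handed (returning None counts as success), the transfer aborts and perform() raises pycurl.error, which the code then maps to the "image larger than disk" failure. A self-contained sketch of that guard (names are illustrative; this variant is slightly stricter and refuses to overshoot by even a partial chunk):

      import pycurl

      class _SizeGuard(object):
        # Write chunks through to a sink until a byte limit would be
        # exceeded, then abort the curl transfer.
        def __init__(self, limit, sink):
          self.received = 0
          self.limit = limit
          self.sink = sink
          self.overflow = False

        def __call__(self, chunk):
          if self.received + len(chunk) > self.limit:
            self.overflow = True
            return -1  # anything != len(chunk) makes libcurl abort
          self.received += len(chunk)
          self.sink.write(chunk)

      # usage sketch: curl.setopt(pycurl.WRITEFUNCTION, _SizeGuard(limit, f))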
3403  
3404  def BlockdevConvert(src_disk, target_disk):
3405 """Copies data from source block device to target. 3406 3407 This function gets the export and import commands from the source and 3408 target devices respectively, and then concatenates them to a single 3409 command using a pipe ("|"). Finally, executes the unified command that 3410 will transfer the data between the devices during the disk template 3411 conversion operation. 3412 3413 @type src_disk: L{objects.Disk} 3414 @param src_disk: the disk object we want to copy from 3415 @type target_disk: L{objects.Disk} 3416 @param target_disk: the disk object we want to copy to 3417 3418 @rtype: NoneType 3419 @return: None 3420 @raise RPCFail: in case of failure 3421 3422 """ 3423 src_dev = _RecursiveFindBD(src_disk) 3424 if src_dev is None: 3425 _Fail("Cannot copy from device '%s': device not found", src_disk.uuid) 3426 3427 dest_dev = _RecursiveFindBD(target_disk) 3428 if dest_dev is None: 3429 _Fail("Cannot copy to device '%s': device not found", target_disk.uuid) 3430 3431 src_cmd = src_dev.Export() 3432 dest_cmd = dest_dev.Import() 3433 command = "%s | %s" % (utils.ShellQuoteArgs(src_cmd), 3434 utils.ShellQuoteArgs(dest_cmd)) 3435 3436 result = utils.RunCmd(command) 3437 if result.failed: 3438 _Fail("Disk conversion command '%s' exited with error: %s; output: %s", 3439 result.cmd, result.fail_reason, result.output)
3440
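For a hypothetical pair of LVM volumes whose Export() and Import() methods yield dd argument vectors, the pipeline assembled above would come out as:

      # Hypothetical Export()/Import() results:
      src_cmd = ["dd", "if=/dev/vg0/disk-src", "bs=1M"]
      dest_cmd = ["dd", "of=/dev/vg0/disk-dst", "bs=1M", "oflag=direct"]
      command = "%s | %s" % (utils.ShellQuoteArgs(src_cmd),
                             utils.ShellQuoteArgs(dest_cmd))
      # -> dd if=/dev/vg0/disk-src bs=1M | dd of=/dev/vg0/disk-dst bs=1M oflag=direct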
3441  
3442  def BlockdevWipe(disk, offset, size):
3443 """Wipes a block device. 3444 3445 @type disk: L{objects.Disk} 3446 @param disk: the disk object we want to wipe 3447 @type offset: int 3448 @param offset: The offset in MiB in the file 3449 @type size: int 3450 @param size: The size in MiB to write 3451 3452 """ 3453 try: 3454 rdev = _RecursiveFindBD(disk) 3455 except errors.BlockDeviceError: 3456 rdev = None 3457 3458 if not rdev: 3459 _Fail("Cannot wipe device %s: device not found", disk.iv_name) 3460 if offset < 0: 3461 _Fail("Negative offset") 3462 if size < 0: 3463 _Fail("Negative size") 3464 if offset > rdev.size: 3465 _Fail("Wipe offset is bigger than device size") 3466 if (