Package ganeti :: Package client :: Module gnt_cluster
[hide private]
[frames] | no frames]

Source Code for Module ganeti.client.gnt_cluster

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2006, 2007, 2010, 2011, 2012, 2013 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30  """Cluster related commands""" 
  31   
  32  # pylint: disable=W0401,W0613,W0614,C0103 
  33  # W0401: Wildcard import ganeti.cli 
  34  # W0613: Unused argument, since all functions follow the same API 
  35  # W0614: Unused import %s from wildcard import (since we need cli) 
  36  # C0103: Invalid name gnt-cluster 
  37   
  38  from cStringIO import StringIO 
  39  import os 
  40  import time 
  41  import OpenSSL 
  42  import tempfile 
  43  import itertools 
  44   
  45  from ganeti.cli import * 
  46  from ganeti import opcodes 
  47  from ganeti import constants 
  48  from ganeti import errors 
  49  from ganeti import utils 
  50  from ganeti import bootstrap 
  51  from ganeti import ssh 
  52  from ganeti import objects 
  53  from ganeti import uidpool 
  54  from ganeti import compat 
  55  from ganeti import netutils 
  56  from ganeti import ssconf 
  57  from ganeti import pathutils 
  58  from ganeti import serializer 
  59  from ganeti import qlang 
  60   
  61   
# Command line options and tunables used only by the gnt-cluster commands.

# "--on": boolean flag (default off) used to recover from an EPO
ON_OPT = cli_option("--on", default=False,
                    action="store_true", dest="on",
                    help="Recover from an EPO")

# "--groups": boolean flag making the positional arguments node groups
GROUPS_OPT = cli_option("--groups", default=False,
                        action="store_true", dest="groups",
                        help="Arguments are node groups instead of nodes")

# NOTE: FORCE_FAILOVER and FORCE_DISTRIBUTION both use the "--yes-do-it" flag
# and the "yes_do_it" destination; only the help text differs
FORCE_FAILOVER = cli_option("--yes-do-it", dest="yes_do_it",
                            help="Override interactive check for --no-voting",
                            default=False, action="store_true")

FORCE_DISTRIBUTION = cli_option("--yes-do-it", dest="yes_do_it",
                                help="Unconditionally distribute the"
                                " configuration, even if the queue"
                                " is drained",
                                default=False, action="store_true")

# "--to": target version string for an upgrade (no default)
TO_OPT = cli_option("--to", default=None, type="string",
                    help="The Ganeti version to upgrade to")

# "--resume": boolean flag to resume pending upgrades
RESUME_OPT = cli_option("--resume", default=False, action="store_true",
                        help="Resume any pending Ganeti upgrades")

# EPO timing parameters, all expressed in seconds
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
_EPO_PING_TIMEOUT = 1 # 1 second
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
def _InitEnabledDiskTemplates(opts):
  """Compute the disk templates to enable on a new cluster.

  @param opts: the command line options selected by the user
  @return: the comma-separated value of C{opts.enabled_disk_templates} split
      into a list, or L{constants.DEFAULT_ENABLED_DISK_TEMPLATES} if the
      option is unset or empty

  """
  requested = opts.enabled_disk_templates
  if not requested:
    return constants.DEFAULT_ENABLED_DISK_TEMPLATES
  return requested.split(",")
99
def _InitVgName(opts, enabled_disk_templates):
  """Work out the volume group name for a new cluster.

  @param opts: the command line options selected by the user
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the volume group name to use; may be C{None}
  @raise errors.OpPrereqError: if an empty volume group name was given
      while LVM-based disk templates are enabled

  """
  requested = opts.vg_name
  lvm_enabled = utils.IsLvmEnabled(enabled_disk_templates)

  if requested is None:
    # Nothing given on the command line: default only if LVM is in use
    if lvm_enabled:
      return constants.DEFAULT_VG
    return None

  if requested:
    if not lvm_enabled:
      # Harmless, but most likely not what the user intended
      ToStdout("You specified a volume group with --vg-name, but you did not"
               " enable any disk template that uses lvm.")
  elif lvm_enabled:
    # Explicitly emptied volume group name while LVM is needed
    raise errors.OpPrereqError(
        "LVM disk templates are enabled, but vg name not set.")

  return requested
121
def _InitDrbdHelper(opts, enabled_disk_templates):
  """Work out the DRBD usermode helper for a new cluster.

  @param opts: the command line options selected by the user
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the helper given on the command line, or
      L{constants.DEFAULT_DRBD_HELPER} if DRBD is enabled and none was given
  @raise errors.OpPrereqError: if DRBD is enabled and the helper was
      explicitly set to the empty string

  """
  if constants.DT_DRBD8 in enabled_disk_templates:
    if opts.drbd_helper is None:
      return constants.DEFAULT_DRBD_HELPER
    if opts.drbd_helper == '':
      raise errors.OpPrereqError(
          "Unsetting the drbd usermode helper while enabling DRBD is not"
          " allowed.")
  elif opts.drbd_helper is not None:
    # A helper without DRBD is accepted, but the user is told about it
    ToStdout("Note: You specified a DRBD usermode helper, while DRBD storage"
             " is not enabled.")

  return opts.drbd_helper
142
@UsesRPC
def InitCluster(opts, args):
  """Initialize the cluster.

  Fills in defaults for every option not given on the command line, validates
  the values that can be checked locally, hands everything to
  L{bootstrap.InitCluster} and finally submits an L{opcodes.OpClusterPostInit}
  opcode.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the desired
      cluster name
  @rtype: int
  @return: the desired exit code (1 if local validation failed, 0 otherwise)

  """
  enabled_disk_templates = _InitEnabledDiskTemplates(opts)

  # Both helpers signal inconsistent option combinations via OpPrereqError
  try:
    vg_name = _InitVgName(opts, enabled_disk_templates)
    drbd_helper = _InitDrbdHelper(opts, enabled_disk_templates)
  except errors.OpPrereqError, e:
    ToStderr(str(e))
    return 1

  master_netdev = opts.master_netdev
  if master_netdev is None:
    # NOTE(review): a nic mode that is set but not OVS leaves master_netdev
    # as None here; confirm that bootstrap.InitCluster copes with that
    nic_mode = opts.nicparams.get(constants.NIC_MODE, None)
    if not nic_mode:
      # default case, use bridging
      master_netdev = constants.DEFAULT_BRIDGE
    elif nic_mode == constants.NIC_MODE_OVS:
      # default ovs is different from default bridge
      master_netdev = constants.DEFAULT_OVS
      opts.nicparams[constants.NIC_LINK] = constants.DEFAULT_OVS

  # The hypervisor list is handled as a comma-separated string
  hvlist = opts.enabled_hypervisors
  if hvlist is None:
    hvlist = constants.DEFAULT_ENABLED_HYPERVISOR
  hvlist = hvlist.split(",")

  # hvparams and diskparams are copied as they get new keys added below
  hvparams = dict(opts.hvparams)
  beparams = opts.beparams
  nicparams = opts.nicparams

  diskparams = dict(opts.diskparams)

  # check the disk template types here, as we cannot rely on the type check done
  # by the opcode parameter types
  diskparams_keys = set(diskparams.keys())
  if not (diskparams_keys <= constants.DISK_TEMPLATES):
    unknown = utils.NiceSort(diskparams_keys - constants.DISK_TEMPLATES)
    ToStderr("Disk templates unknown: %s" % utils.CommaJoin(unknown))
    return 1

  # prepare beparams dict
  beparams = objects.FillDict(constants.BEC_DEFAULTS, beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)

  # prepare nicparams dict
  nicparams = objects.FillDict(constants.NICC_DEFAULTS, nicparams)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  # prepare ndparams dict
  if opts.ndparams is None:
    ndparams = dict(constants.NDC_DEFAULTS)
  else:
    ndparams = objects.FillDict(constants.NDC_DEFAULTS, opts.ndparams)
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)

  # prepare hvparams dict
  # (every known hypervisor gets an entry, not only the enabled ones)
  for hv in constants.HYPER_TYPES:
    if hv not in hvparams:
      hvparams[hv] = {}
    hvparams[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], hvparams[hv])
    utils.ForceDictType(hvparams[hv], constants.HVS_PARAMETER_TYPES)

  # prepare diskparams dict
  # (every known disk template gets an entry, not only the enabled ones)
  for templ in constants.DISK_TEMPLATES:
    if templ not in diskparams:
      diskparams[templ] = {}
    diskparams[templ] = objects.FillDict(constants.DISK_DT_DEFAULTS[templ],
                                         diskparams[templ])
    utils.ForceDictType(diskparams[templ], constants.DISK_DT_TYPES)

  # prepare ipolicy dict
  ipolicy = CreateIPolicyFromOpts(
    ispecs_mem_size=opts.ispecs_mem_size,
    ispecs_cpu_count=opts.ispecs_cpu_count,
    ispecs_disk_count=opts.ispecs_disk_count,
    ispecs_disk_size=opts.ispecs_disk_size,
    ispecs_nic_count=opts.ispecs_nic_count,
    minmax_ispecs=opts.ipolicy_bounds_specs,
    std_ispecs=opts.ipolicy_std_specs,
    ipolicy_disk_templates=opts.ipolicy_disk_templates,
    ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
    ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
    fill_all=True)

  # The defaults below are written back into opts, which is passed on later
  if opts.candidate_pool_size is None:
    opts.candidate_pool_size = constants.MASTER_POOL_SIZE_DEFAULT

  if opts.mac_prefix is None:
    opts.mac_prefix = constants.DEFAULT_MAC_PREFIX

  uid_pool = opts.uid_pool
  if uid_pool is not None:
    uid_pool = uidpool.ParseUidPool(uid_pool)

  if opts.prealloc_wipe_disks is None:
    opts.prealloc_wipe_disks = False

  external_ip_setup_script = opts.use_external_mip_script
  if external_ip_setup_script is None:
    external_ip_setup_script = False

  try:
    primary_ip_version = int(opts.primary_ip_version)
  except (ValueError, TypeError), err:
    ToStderr("Invalid primary ip version value: %s" % str(err))
    return 1

  # Unlike the IP version, the netmask is optional and may remain None
  master_netmask = opts.master_netmask
  try:
    if master_netmask is not None:
      master_netmask = int(master_netmask)
  except (ValueError, TypeError), err:
    ToStderr("Invalid master netmask value: %s" % str(err))
    return 1

  if opts.disk_state:
    disk_state = utils.FlatToDict(opts.disk_state)
  else:
    disk_state = {}

  hv_state = dict(opts.hv_state)

  default_ialloc_params = opts.default_iallocator_params

  # Normalize whatever truthy/falsy value the option holds to a real boolean
  if opts.enabled_user_shutdown:
    enabled_user_shutdown = True
  else:
    enabled_user_shutdown = False

  bootstrap.InitCluster(cluster_name=args[0],
                        secondary_ip=opts.secondary_ip,
                        vg_name=vg_name,
                        mac_prefix=opts.mac_prefix,
                        master_netmask=master_netmask,
                        master_netdev=master_netdev,
                        file_storage_dir=opts.file_storage_dir,
                        shared_file_storage_dir=opts.shared_file_storage_dir,
                        gluster_storage_dir=opts.gluster_storage_dir,
                        enabled_hypervisors=hvlist,
                        hvparams=hvparams,
                        beparams=beparams,
                        nicparams=nicparams,
                        ndparams=ndparams,
                        diskparams=diskparams,
                        ipolicy=ipolicy,
                        candidate_pool_size=opts.candidate_pool_size,
                        modify_etc_hosts=opts.modify_etc_hosts,
                        modify_ssh_setup=opts.modify_ssh_setup,
                        maintain_node_health=opts.maintain_node_health,
                        drbd_helper=drbd_helper,
                        uid_pool=uid_pool,
                        default_iallocator=opts.default_iallocator,
                        default_iallocator_params=default_ialloc_params,
                        primary_ip_version=primary_ip_version,
                        prealloc_wipe_disks=opts.prealloc_wipe_disks,
                        use_external_mip_script=external_ip_setup_script,
                        hv_state=hv_state,
                        disk_state=disk_state,
                        enabled_disk_templates=enabled_disk_templates,
                        enabled_user_shutdown=enabled_user_shutdown,
                        )
  # The post-init opcode is submitted only once bootstrap has returned
  op = opcodes.OpClusterPostInit()
  SubmitOpCode(op, opts=opts)
  return 0
319
@UsesRPC
def DestroyCluster(opts, args):
  """Destroy the cluster.

  Nothing is done unless the user confirmed the operation with
  C{--yes-do-it}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if opts.yes_do_it:
    master_uuid = SubmitOpCode(opcodes.OpClusterDestroy(), opts=opts)
    # The opcode succeeded if we get here, so the daemons can be shut down
    bootstrap.FinalizeClusterDestroy(master_uuid)
    return 0

  ToStderr("Destroying a cluster is irreversible. If you really want"
           " destroy this cluster, supply the --yes-do-it option.")
  return 1
343
def RenameCluster(opts, args):
  """Rename the cluster.

  Unless C{opts.force} is set, the user is asked for confirmation first.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the new cluster name
  @rtype: int
  @return: the desired exit code

  """
  client = GetClient()
  (old_name, ) = client.QueryConfigValues(["cluster_name"])
  new_name = args[0]

  if not opts.force:
    question = ("This will rename the cluster from '%s' to '%s'. If you are"
                " connected over the network to the cluster name, the"
                " operation is very dangerous as the IP address will be"
                " removed from the node and the change may not go through."
                " Continue?") % (old_name, new_name)
    if not AskUser(question):
      return 1

  rename_op = opcodes.OpClusterRename(name=new_name)
  final_name = SubmitOpCode(rename_op, opts=opts, cl=client)
  if final_name:
    ToStdout("Cluster renamed from '%s' to '%s'", old_name, final_name)

  return 0
376
def ActivateMasterIp(opts, args):
  """Activates the master IP.

  @param opts: the command line options selected by the user (unused)
  @type args: list
  @param args: unused
  @rtype: int
  @return: the desired exit code

  """
  SubmitOpCode(opcodes.OpClusterActivateMasterIp())
  return 0
385
def DeactivateMasterIp(opts, args):
  """Deactivates the master IP.

  Unless C{opts.confirm} is set, the user is asked for confirmation first.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: unused
  @rtype: int
  @return: the desired exit code

  """
  needs_confirmation = not opts.confirm
  if needs_confirmation:
    question = ("This will disable the master IP. All the open connections to"
                " the master IP will be closed. To reach the master you will"
                " need to use its node IP."
                " Continue?")
    if not AskUser(question):
      return 1

  SubmitOpCode(opcodes.OpClusterDeactivateMasterIp())
  return 0
402
def RedistributeConfig(opts, args):
  """Forces push of the cluster configuration.

  With C{--yes-do-it} the opcode is submitted through
  L{SubmitOpCodeToDrainedQueue}, otherwise through L{SubmitOrSend}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: empty list
  @rtype: int
  @return: the desired exit code

  """
  redist_op = opcodes.OpClusterRedistConf()
  if not opts.yes_do_it:
    SubmitOrSend(redist_op, opts)
  else:
    SubmitOpCodeToDrainedQueue(redist_op)
  return 0
420
def ShowClusterVersion(opts, args):
  """Write version of ganeti software to the standard output.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  info = GetClient(query=True).QueryClusterInfo()

  # (output format, key in the cluster info), printed in this order
  version_lines = [
    ("Software version: %s", "software_version"),
    ("Internode protocol: %s", "protocol_version"),
    ("Configuration format: %s", "config_version"),
    ("OS api version: %s", "os_api_version"),
    ("Export interface: %s", "export_version"),
    ("VCS version: %s", "vcs_version"),
    ]
  for (fmt, key) in version_lines:
    ToStdout(fmt, info[key])

  return 0
441
def ShowClusterMaster(opts, args):
  """Write name of master node to the standard output.

  The name is obtained via L{bootstrap.GetMaster}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  ToStdout(bootstrap.GetMaster())
  return 0
456
def _FormatGroupedParams(paramsdict, roman=False):
  """Format Grouped parameters (be, nic, disk) by group.

  Nested dictionaries are converted recursively; every other value is turned
  into a string.

  @type paramsdict: dict of dicts
  @param paramsdict: {group: {param: value, ...}, ...}
  @type roman: bool
  @param roman: whether integer values are passed through
      L{compat.TryToRoman} instead of C{str}
  @rtype: dict of dicts
  @return: copy of the input dictionaries with strings as values

  """
  def _FormatValue(value):
    """Format a single value, descending into dictionaries."""
    if isinstance(value, dict):
      return _FormatGroupedParams(value, roman=roman)
    if roman and isinstance(value, int):
      return compat.TryToRoman(value)
    return str(value)

  return dict((key, _FormatValue(value))
              for (key, value) in paramsdict.items())
476
def ShowClusterConfig(opts, args):
  """Shows cluster information.

  Queries the cluster info and prints it via L{PrintGenericInfo}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  cluster = GetClient(query=True).QueryClusterInfo()
  roman = opts.roman_integers

  tags = "(none)"
  if cluster["tags"]:
    tags = utils.CommaJoin(utils.NiceSort(cluster["tags"]))

  reserved_lvs = "(none)"
  if cluster["reserved_lvs"]:
    reserved_lvs = utils.CommaJoin(cluster["reserved_lvs"])

  # Only the parameters of enabled hypervisors are shown
  enabled_hv = cluster["enabled_hypervisors"]
  hvparams = {}
  for (hv_name, hv_dict) in cluster["hvparams"].items():
    if hv_name in enabled_hv:
      hvparams[hv_name] = hv_dict

  cluster_params = [
    ("candidate pool size",
     compat.TryToRoman(cluster["candidate_pool_size"], convert=roman)),
    ("maximal number of jobs running simultaneously",
     compat.TryToRoman(cluster["max_running_jobs"], convert=roman)),
    ("master netdev", cluster["master_netdev"]),
    ("master netmask", cluster["master_netmask"]),
    ("use external master IP address setup script",
     cluster["use_external_mip_script"]),
    ("lvm volume group", cluster["volume_group_name"]),
    ("lvm reserved volumes", reserved_lvs),
    ("drbd usermode helper", cluster["drbd_usermode_helper"]),
    ("file storage path", cluster["file_storage_dir"]),
    ("shared file storage path", cluster["shared_file_storage_dir"]),
    ("gluster storage path", cluster["gluster_storage_dir"]),
    ("maintenance of node health", cluster["maintain_node_health"]),
    ("uid pool", uidpool.FormatUidPool(cluster["uid_pool"])),
    ("default instance allocator", cluster["default_iallocator"]),
    ("default instance allocator parameters",
     cluster["default_iallocator_params"]),
    ("primary ip version", cluster["primary_ip_version"]),
    ("preallocation wipe disks", cluster["prealloc_wipe_disks"]),
    ("OS search path", utils.CommaJoin(pathutils.OS_SEARCH_PATH)),
    ("ExtStorage Providers search path",
     utils.CommaJoin(pathutils.ES_SEARCH_PATH)),
    ("enabled disk templates",
     utils.CommaJoin(cluster["enabled_disk_templates"])),
    ("enabled user shutdown", cluster["enabled_user_shutdown"]),
    ]

  architecture = "%s (%s)" % (cluster["architecture"][0],
                              cluster["architecture"][1])

  info = [
    ("Cluster name", cluster["name"]),
    ("Cluster UUID", cluster["uuid"]),

    ("Creation time", utils.FormatTime(cluster["ctime"])),
    ("Modification time", utils.FormatTime(cluster["mtime"])),

    ("Master node", cluster["master"]),

    ("Architecture (this node)", architecture),

    ("Tags", tags),

    ("Default hypervisor", cluster["default_hypervisor"]),
    ("Enabled hypervisors", utils.CommaJoin(enabled_hv)),

    # Hypervisor and OS parameters are never shown as roman numerals
    ("Hypervisor parameters", _FormatGroupedParams(hvparams)),
    ("OS-specific hypervisor parameters",
     _FormatGroupedParams(cluster["os_hvp"])),
    ("OS parameters", _FormatGroupedParams(cluster["osparams"])),

    ("Hidden OSes", utils.CommaJoin(cluster["hidden_os"])),
    ("Blacklisted OSes", utils.CommaJoin(cluster["blacklisted_os"])),

    ("Cluster parameters", cluster_params),

    ("Default node parameters",
     _FormatGroupedParams(cluster["ndparams"], roman=roman)),
    ("Default instance parameters",
     _FormatGroupedParams(cluster["beparams"], roman=roman)),
    ("Default nic parameters",
     _FormatGroupedParams(cluster["nicparams"], roman=roman)),
    ("Default disk parameters",
     _FormatGroupedParams(cluster["diskparams"], roman=roman)),

    ("Instance policy - limits for instances",
     FormatPolicyInfo(cluster["ipolicy"], None, True)),
    ]

  PrintGenericInfo(info)
  return 0
581
def ClusterCopyFile(opts, args):
  """Copy a file from master to some nodes.

  Copy failures are reported on stderr but do not change the exit code.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the path of
      the file to be copied
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the file does not exist

  """
  filename = os.path.abspath(args[0])
  if not os.path.exists(filename):
    raise errors.OpPrereqError("No such filename '%s'" % filename,
                               errors.ECODE_INVAL)

  cl = GetClient()
  qcl = GetClient(query=True)
  try:
    config_values = cl.QueryConfigValues(["cluster_name"])
    cluster_name = config_values[0]

    targets = GetOnlineNodes(nodes=opts.nodes, cl=qcl, filter_master=True,
                             secondary_ips=opts.use_replication_network,
                             nodegroup=opts.nodegroup)
    # NOTE(review): the ports are queried for opts.nodes while the targets
    # are filtered (master removed); verify that both lists always line up
    ssh_ports = GetNodesSshPorts(opts.nodes, qcl)
  finally:
    cl.Close()
    qcl.Close()

  runner = ssh.SshRunner(cluster_name)
  for (node, port) in zip(targets, ssh_ports):
    if runner.CopyFileToNode(node, port, filename):
      continue
    ToStderr("Copy of file %s to node %s:%d failed", filename, node, port)

  return 0
620
def RunClusterCommand(opts, args):
  """Run a command on some nodes.

  The command is run on the master node last, after all other nodes.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain the command to be run and its arguments
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()
  qcl = GetClient(query=True)

  command = " ".join(args)

  nodes = GetOnlineNodes(nodes=opts.nodes, cl=qcl, nodegroup=opts.nodegroup)
  ports = GetNodesSshPorts(nodes, qcl)

  cluster_name, master_node = cl.QueryConfigValues(["cluster_name",
                                                    "master_node"])

  srun = ssh.SshRunner(cluster_name=cluster_name)

  # Pair every node with its port *before* reordering; moving the master
  # within the node list alone would shift it (and every node after it) onto
  # another node's SSH port
  targets = list(zip(nodes, ports))

  # Make sure master node is at list end
  targets = ([pair for pair in targets if pair[0] != master_node] +
             [pair for pair in targets if pair[0] == master_node])

  for (name, port) in targets:
    result = srun.Run(name, constants.SSH_LOGIN_USER, command, port=port)

    if opts.failure_only and result.exit_code == constants.EXIT_SUCCESS:
      # Do not output anything for successful commands
      continue

    ToStdout("------------------------------------------------")
    if opts.show_machine_names:
      for line in result.output.splitlines():
        ToStdout("%s: %s", name, line)
    else:
      ToStdout("node: %s", name)
      ToStdout("%s", result.output)
    ToStdout("return code = %s", result.exit_code)

  return 0
667
def VerifyCluster(opts, args):
  """Verify integrity of cluster, performing various test on nodes.

  Submits an L{opcodes.OpClusterVerify} opcode, waits for all the jobs it
  spawned and derives the exit code from their results.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  skip_checks = []

  if opts.skip_nplusone_mem:
    skip_checks.append(constants.VERIFY_NPLUSONE_MEM)

  cl = GetClient()

  op = opcodes.OpClusterVerify(verbose=opts.verbose,
                               error_codes=opts.error_codes,
                               debug_simulate_errors=opts.simulate_errors,
                               skip_checks=skip_checks,
                               ignore_errors=opts.ignore_errors,
                               group_name=opts.nodegroup)
  result = SubmitOpCode(op, cl=cl, opts=opts)

  # Keep track of submitted jobs
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result[constants.JOB_IDS_KEY]:
    jex.AddJobId(None, status, job_id)

  # Count failed jobs and jobs with a bad result separately. Plain counters
  # also work when no job was submitted at all, where unpacking the result of
  # zip(*...) over an empty sequence would fail
  bad_jobs = 0
  bad_results = 0
  for (job_success, op_results) in jex.GetResults():
    if not job_success:
      bad_jobs += 1
    # A good result consists of exactly one opcode result that is true
    if not (len(op_results) == 1 and op_results[0]):
      bad_results += 1

  if bad_jobs == 0 and bad_results == 0:
    rcode = constants.EXIT_SUCCESS
  else:
    rcode = constants.EXIT_FAILURE
    if bad_jobs > 0:
      ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs)

  return rcode
720
def VerifyDisks(opts, args):
  """Verify integrity of cluster disks.

  Submits an L{opcodes.OpClusterVerifyDisks} opcode, waits for the jobs it
  spawned and, for every instance reported by them that has no missing
  volumes, tries to activate its disks.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()

  op = opcodes.OpClusterVerifyDisks()

  result = SubmitOpCode(op, cl=cl, opts=opts)

  # Keep track of submitted jobs
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result[constants.JOB_IDS_KEY]:
    jex.AddJobId(None, status, job_id)

  retcode = constants.EXIT_SUCCESS

  for (status, job_result) in jex.GetResults():
    if not status:
      ToStdout("Job failed: %s", job_result)
      # A failed verification job must not be reported as overall success
      retcode = constants.EXIT_FAILURE
      continue

    # Each job is expected to hold the result of exactly one opcode
    ((bad_nodes, instances, missing), ) = job_result

    for node, text in bad_nodes.items():
      # Only the last 400 characters of the error text are shown
      ToStdout("Error gathering data on node %s: %s",
               node, utils.SafeEncode(text[-400:]))
      retcode = constants.EXIT_FAILURE
      ToStdout("You need to fix these nodes first before fixing instances")

    for iname in instances:
      if iname in missing:
        # Instances with missing volumes are only reported further down
        continue
      activate_op = opcodes.OpInstanceActivateDisks(instance_name=iname)
      try:
        ToStdout("Activating disks for instance '%s'", iname)
        SubmitOpCode(activate_op, opts=opts, cl=cl)
      except errors.GenericError as err:
        nret, msg = FormatError(err)
        retcode |= nret
        ToStderr("Error activating disks for instance %s: %s", iname, msg)

    if missing:
      for iname, ival in missing.items():
        all_missing = compat.all(x[0] in bad_nodes for x in ival)
        if all_missing:
          ToStdout("Instance %s cannot be verified as it lives on"
                   " broken nodes", iname)
        else:
          ToStdout("Instance %s has missing logical volumes:", iname)
          ival.sort()
          for node, vol in ival:
            if node in bad_nodes:
              ToStdout("\tbroken node %s /dev/%s", node, vol)
            else:
              ToStdout("\t%s /dev/%s", node, vol)

      ToStdout("You need to replace or recreate disks for all the above"
               " instances if this message persists after fixing broken nodes.")
      retcode = constants.EXIT_FAILURE
    elif not instances:
      ToStdout("No disks need to be activated.")

  return retcode
793
def RepairDiskSizes(opts, args):
  """Verify sizes of cluster disks.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: optional list of instances to restrict check to
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterRepairDiskSizes(instances=args)
  SubmitOpCode(op, opts=opts)
  # Return an explicit exit code like the other commands, instead of
  # implicitly returning None
  return 0
807
@UsesRPC
def MasterFailover(opts, args):
  """Failover the master node.

  This command, when run on a non-master node, will cause the current
  master to cease being master, and the non-master to become new
  master.

  When C{--no-voting} is given without C{--yes-do-it}, the user is asked for
  confirmation first.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  must_confirm = opts.no_voting and not opts.yes_do_it
  if must_confirm:
    question = ("This will perform the failover even if most other nodes"
                " are down, or if this node is outdated. This is dangerous"
                " as it can lead to a non-consistent cluster. Check the"
                " gnt-cluster(8) man page before proceeding. Continue?")
    if not AskUser(question):
      return 1

  (exit_code, messages) = bootstrap.MasterFailover(no_voting=opts.no_voting)
  for message in messages:
    ToStderr(message)
  return exit_code
836
def MasterPing(opts, args):
  """Checks if the master is alive.

  The master counts as alive if a client can be created and a cluster info
  query succeeds; any exception counts as failure.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  try:
    GetClient().QueryClusterInfo()
  except Exception: # pylint: disable=W0703
    return 1
  return 0
854
def SearchTags(opts, args):
  """Searches the tags on all the cluster.

  Prints one "path tag" line per match, in sorted order.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the tag pattern
  @rtype: int
  @return: the desired exit code (1 if nothing matched, 0 otherwise)

  """
  op = opcodes.OpTagsSearch(pattern=args[0])
  result = SubmitOpCode(op, opts=opts)
  if not result:
    return 1
  for path, tag in sorted(result):
    ToStdout("%s %s", path, tag)
  # Return an explicit exit code on success, instead of implicitly
  # returning None
  return 0
874
def _ReadAndVerifyCert(cert_filename, verify_private_key=False):
  """Reads and verifies an X509 certificate.

  Verification means that the file contents can be loaded by OpenSSL as a
  PEM certificate and, if requested, as a PEM private key as well.

  @type cert_filename: string
  @param cert_filename: the path of the file containing the certificate to
                        verify encoded in PEM format
  @type verify_private_key: bool
  @param verify_private_key: whether to verify the private key in addition to
                             the public certificate
  @rtype: string
  @return: a string containing the PEM-encoded certificate.
  @raise errors.X509CertError: if the file cannot be read or loaded

  """
  try:
    pem = utils.ReadFile(cert_filename)
  except IOError as err:
    raise errors.X509CertError(cert_filename,
                               "Unable to read certificate: %s" % str(err))

  # (loader function, error message) pairs, applied in order
  loaders = [
    (OpenSSL.crypto.load_certificate, "Unable to load certificate: %s"),
    ]
  if verify_private_key:
    loaders.append((OpenSSL.crypto.load_privatekey,
                    "Unable to load private key: %s"))

  for (load_fn, error_fmt) in loaders:
    try:
      load_fn(OpenSSL.crypto.FILETYPE_PEM, pem)
    except Exception as err:
      raise errors.X509CertError(cert_filename, error_fmt % str(err))

  return pem
909
def _RenewCrypto(new_cluster_cert, new_rapi_cert, # pylint: disable=R0911
                 rapi_cert_filename, new_spice_cert, spice_cert_filename,
                 spice_cacert_filename, new_confd_hmac_key, new_cds,
                 cds_filename, force, new_node_cert):
  """Renews cluster certificates, keys and secrets.

  The option combination is validated and all user-supplied files are read
  before anything is changed. The replacement itself runs via
  L{RunWhileClusterStopped}.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type rapi_cert_filename: string
  @param rapi_cert_filename: Path to file containing new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type spice_cert_filename: string
  @param spice_cert_filename: Path to file containing new SPICE certificate
  @type spice_cacert_filename: string
  @param spice_cacert_filename: Path to file containing the certificate of the
                                CA that signed the SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type cds_filename: string
  @param cds_filename: Path to file containing new cluster domain secret
  @type force: bool
  @param force: If true, the user is not asked for confirmation
  @type new_node_cert: bool
  @param new_node_cert: Whether to generate new node certificates
  @rtype: int
  @return: the desired exit code (1 on invalid options, unreadable input
      files or if the user declined, 0 otherwise)

  """
  # Generating a new item and supplying a file for it are mutually exclusive
  if new_rapi_cert and rapi_cert_filename:
    ToStderr("Only one of the --new-rapi-certificate and --rapi-certificate"
             " options can be specified at the same time.")
    return 1

  if new_cds and cds_filename:
    ToStderr("Only one of the --new-cluster-domain-secret and"
             " --cluster-domain-secret options can be specified at"
             " the same time.")
    return 1

  if new_spice_cert and (spice_cert_filename or spice_cacert_filename):
    ToStderr("When using --new-spice-certificate, the --spice-certificate"
             " and --spice-ca-certificate must not be used.")
    return 1

  # The SPICE certificate and its CA certificate only make sense as a pair
  if bool(spice_cacert_filename) ^ bool(spice_cert_filename):
    ToStderr("Both --spice-certificate and --spice-ca-certificate must be"
             " specified.")
    return 1

  rapi_cert_pem, spice_cert_pem, spice_cacert_pem = (None, None, None)
  try:
    # The RAPI and SPICE certificate files are also checked for a loadable
    # private key, the CA certificate is not
    if rapi_cert_filename:
      rapi_cert_pem = _ReadAndVerifyCert(rapi_cert_filename, True)
    if spice_cert_filename:
      spice_cert_pem = _ReadAndVerifyCert(spice_cert_filename, True)
      spice_cacert_pem = _ReadAndVerifyCert(spice_cacert_filename)
  except errors.X509CertError, err:
    ToStderr("Unable to load X509 certificate from %s: %s", err[0], err[1])
    return 1

  if cds_filename:
    try:
      cds = utils.ReadFile(cds_filename)
    except Exception, err: # pylint: disable=W0703
      ToStderr("Can't load new cluster domain secret from %s: %s" %
               (cds_filename, str(err)))
      return 1
  else:
    cds = None

  if not force:
    usertext = ("This requires all daemons on all nodes to be restarted and"
                " may take some time. Continue?")
    if not AskUser(usertext):
      return 1

  def _RenewCryptoInner(ctx):
    # Callback for RunWhileClusterStopped: regenerates the requested items
    # and copies the affected files to all non-master nodes
    ctx.feedback_fn("Updating certificates and keys")
    # Note: the node certificate will be generated in the LU
    bootstrap.GenerateClusterCrypto(new_cluster_cert,
                                    new_rapi_cert,
                                    new_spice_cert,
                                    new_confd_hmac_key,
                                    new_cds,
                                    rapi_cert_pem=rapi_cert_pem,
                                    spice_cert_pem=spice_cert_pem,
                                    spice_cacert_pem=spice_cacert_pem,
                                    cds=cds)

    # Only files belonging to items that were actually replaced are copied
    files_to_copy = []

    if new_cluster_cert:
      files_to_copy.append(pathutils.NODED_CERT_FILE)

    if new_rapi_cert or rapi_cert_pem:
      files_to_copy.append(pathutils.RAPI_CERT_FILE)

    if new_spice_cert or spice_cert_pem:
      files_to_copy.append(pathutils.SPICE_CERT_FILE)
      files_to_copy.append(pathutils.SPICE_CACERT_FILE)

    if new_confd_hmac_key:
      files_to_copy.append(pathutils.CONFD_HMAC_KEY)

    if new_cds or cds:
      files_to_copy.append(pathutils.CLUSTER_DOMAIN_SECRET_FILE)

    if files_to_copy:
      for node_name in ctx.nonmaster_nodes:
        port = ctx.ssh_ports[node_name]
        ctx.feedback_fn("Copying %s to %s:%d" %
                        (", ".join(files_to_copy), node_name, port))
        for file_name in files_to_copy:
          ctx.ssh.CopyFileToNode(node_name, port, file_name)

  RunWhileClusterStopped(ToStdout, _RenewCryptoInner)

  # NOTE(review): this message is printed before the node certificates are
  # renewed below
  ToStdout("All requested certificates and keys have been replaced."
           " Running \"gnt-cluster verify\" now is recommended.")

  # Node certificates are renewed through an opcode, after the run above
  if new_node_cert:
    cl = GetClient()
    renew_op = opcodes.OpClusterRenewCrypto()
    SubmitOpCode(renew_op, cl=cl)

  return 0
def RenewCrypto(opts, args):
  """Command entry point for renewing certificates, keys and secrets.

  Unpacks the relevant command line options and forwards them, in the
  positional order expected, to L{_RenewCrypto}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: positional arguments (not used here)
  @return: whatever L{_RenewCrypto} returns (the exit code)

  """
  renew_args = (
    opts.new_cluster_cert,
    opts.new_rapi_cert,
    opts.rapi_cert,
    opts.new_spice_cert,
    opts.spice_cert,
    opts.spice_cacert,
    opts.new_confd_hmac_key,
    opts.new_cluster_domain_secret,
    opts.cluster_domain_secret,
    opts.force,
    opts.new_node_cert,
    )
  return _RenewCrypto(*renew_args)
1057
1058 1059 -def _GetEnabledDiskTemplates(opts):
1060 """Determine the list of enabled disk templates. 1061 1062 """ 1063 if opts.enabled_disk_templates: 1064 return opts.enabled_disk_templates.split(",") 1065 else: 1066 return None
1067
1068 1069 -def _GetVgName(opts, enabled_disk_templates):
1070 """Determine the volume group name. 1071 1072 @type enabled_disk_templates: list of strings 1073 @param enabled_disk_templates: cluster-wide enabled disk-templates 1074 1075 """ 1076 # consistency between vg name and enabled disk templates 1077 vg_name = None 1078 if opts.vg_name is not None: 1079 vg_name = opts.vg_name 1080 if enabled_disk_templates: 1081 if vg_name and not utils.IsLvmEnabled(enabled_disk_templates): 1082 ToStdout("You specified a volume group with --vg-name, but you did not" 1083 " enable any of the following lvm-based disk templates: %s" % 1084 utils.CommaJoin(constants.DTS_LVM)) 1085 return vg_name
1086
1087 1088 -def _GetDrbdHelper(opts, enabled_disk_templates):
1089 """Determine the DRBD usermode helper. 1090 1091 """ 1092 drbd_helper = opts.drbd_helper 1093 if enabled_disk_templates: 1094 drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates 1095 if not drbd_enabled and opts.drbd_helper: 1096 ToStdout("You specified a DRBD usermode helper with " 1097 " --drbd-usermode-helper while DRBD is not enabled.") 1098 return drbd_helper
1099
def SetClusterParams(opts, args):
  """Modify the cluster.

  Validates and converts the command line options, builds a single
  C{OpClusterSetParams} opcode from them and submits it.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  def _ParseOptionalUidPool(value):
    """Parse a uid pool specification, passing None through untouched."""
    if value is None:
      return None
    return uidpool.ParseUidPool(value)

  # At least one modification has to be requested
  something_requested = (
    opts.vg_name is not None or
    opts.drbd_helper is not None or
    opts.enabled_hypervisors or opts.hvparams or
    opts.beparams or opts.nicparams or
    opts.ndparams or opts.diskparams or
    opts.candidate_pool_size is not None or
    opts.max_running_jobs is not None or
    opts.uid_pool is not None or
    opts.maintain_node_health is not None or
    opts.add_uids is not None or
    opts.remove_uids is not None or
    opts.default_iallocator is not None or
    opts.default_iallocator_params is not None or
    opts.reserved_lvs is not None or
    opts.master_netdev is not None or
    opts.master_netmask is not None or
    opts.use_external_mip_script is not None or
    opts.prealloc_wipe_disks is not None or
    opts.hv_state or
    opts.enabled_disk_templates or
    opts.disk_state or
    opts.ipolicy_bounds_specs is not None or
    opts.ipolicy_std_specs is not None or
    opts.ipolicy_disk_templates is not None or
    opts.ipolicy_vcpu_ratio is not None or
    opts.ipolicy_spindle_ratio is not None or
    opts.modify_etc_hosts is not None or
    opts.file_storage_dir is not None or
    opts.shared_file_storage_dir is not None or
    opts.enabled_user_shutdown is not None
    )
  if not something_requested:
    ToStderr("Please give at least one of the parameters.")
    return 1

  enabled_disk_templates = _GetEnabledDiskTemplates(opts)
  vg_name = _GetVgName(opts, enabled_disk_templates)

  try:
    drbd_helper = _GetDrbdHelper(opts, enabled_disk_templates)
  except errors.OpPrereqError as err:
    ToStderr(str(err))
    return 1

  enabled_hypervisors = opts.enabled_hypervisors
  if enabled_hypervisors is not None:
    enabled_hypervisors = enabled_hypervisors.split(",")

  # opts.hvparams is a list of (name, dict) pairs (or an empty list)
  hvparams = dict(opts.hvparams)
  for per_hv in hvparams.values():
    utils.ForceDictType(per_hv, constants.HVS_PARAMETER_TYPES)

  diskparams = dict(opts.diskparams)
  for per_template in diskparams.values():
    utils.ForceDictType(per_template, constants.DISK_DT_TYPES)

  beparams = opts.beparams
  utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)

  nicparams = opts.nicparams
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  ndparams = opts.ndparams
  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)

  ipolicy = CreateIPolicyFromOpts(
    minmax_ispecs=opts.ipolicy_bounds_specs,
    std_ispecs=opts.ipolicy_std_specs,
    ipolicy_disk_templates=opts.ipolicy_disk_templates,
    ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
    ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
    )

  uid_pool = _ParseOptionalUidPool(opts.uid_pool)
  add_uids = _ParseOptionalUidPool(opts.add_uids)
  remove_uids = _ParseOptionalUidPool(opts.remove_uids)

  # The converted values are written back to opts, as before
  if opts.reserved_lvs is not None:
    if opts.reserved_lvs == "":
      opts.reserved_lvs = []
    else:
      opts.reserved_lvs = utils.UnescapeAndSplit(opts.reserved_lvs, sep=",")

  if opts.master_netmask is not None:
    try:
      opts.master_netmask = int(opts.master_netmask)
    except ValueError:
      ToStderr("The --master-netmask option expects an int parameter.")
      return 1

  if opts.disk_state:
    disk_state = utils.FlatToDict(opts.disk_state)
  else:
    disk_state = {}

  hv_state = dict(opts.hv_state)

  op = opcodes.OpClusterSetParams(
    vg_name=vg_name,
    drbd_helper=drbd_helper,
    enabled_hypervisors=enabled_hypervisors,
    hvparams=hvparams,
    os_hvp=None,
    beparams=beparams,
    nicparams=nicparams,
    ndparams=ndparams,
    diskparams=diskparams,
    ipolicy=ipolicy,
    candidate_pool_size=opts.candidate_pool_size,
    max_running_jobs=opts.max_running_jobs,
    maintain_node_health=opts.maintain_node_health,
    modify_etc_hosts=opts.modify_etc_hosts,
    uid_pool=uid_pool,
    add_uids=add_uids,
    remove_uids=remove_uids,
    default_iallocator=opts.default_iallocator,
    default_iallocator_params=opts.default_iallocator_params,
    prealloc_wipe_disks=opts.prealloc_wipe_disks,
    master_netdev=opts.master_netdev,
    master_netmask=opts.master_netmask,
    reserved_lvs=opts.reserved_lvs,
    use_external_mip_script=opts.use_external_mip_script,
    hv_state=hv_state,
    disk_state=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    force=opts.force,
    file_storage_dir=opts.file_storage_dir,
    shared_file_storage_dir=opts.shared_file_storage_dir,
    enabled_user_shutdown=opts.enabled_user_shutdown,
    )
  SubmitOrSend(op, opts)
  return 0
1256
def QueueOps(opts, args):
  """Handle the job queue sub-commands (drain, undrain, info).

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the subcommand
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the subcommand is not known

  """
  command = args[0]
  client = GetClient()

  if command == "info":
    result = client.QueryConfigValues(["drain_flag"])
    val = "set" if result[0] else "unset"
    ToStdout("The drain flag is %s" % val)
    return 0

  if command not in ("drain", "undrain"):
    raise errors.OpPrereqError("Command '%s' is not valid." % command,
                               errors.ECODE_INVAL)

  # "drain" sets the flag, "undrain" clears it
  client.SetQueueDrainFlag(command == "drain")
  return 0
1285
def _ShowWatcherPause(until):
  """Print whether, and until when, the watcher is paused.

  @param until: timestamp (seconds since the epoch) until which the watcher
    is paused, or None; a timestamp in the past counts as "not paused"

  """
  if until is not None and not until < time.time():
    ToStdout("The watcher is paused until %s.", time.ctime(until))
    return

  ToStdout("The watcher is not paused.")
1292
def WatcherOps(opts, args):
  """Handle the watcher sub-commands (continue, pause, info).

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the subcommand, followed by the pause duration in case of
    the "pause" subcommand
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the subcommand is not known or the
    duration for "pause" is missing

  """
  command = args[0]
  client = GetClient()

  if command == "info":
    result = client.QueryConfigValues(["watcher_pause"])
    _ShowWatcherPause(result[0])
    return 0

  if command == "pause":
    if len(args) < 2:
      raise errors.OpPrereqError("Missing pause duration", errors.ECODE_INVAL)

    # The duration is relative; the watcher expects an absolute timestamp
    pause_until = time.time() + ParseTimespec(args[1])
    _ShowWatcherPause(client.SetWatcherPause(pause_until))
    return 0

  if command == "continue":
    client.SetWatcherPause(None)
    ToStdout("The watcher is no longer paused.")
    return 0

  raise errors.OpPrereqError("Command '%s' is not valid." % command,
                             errors.ECODE_INVAL)
1327
def _OobPower(opts, node_list, power):
  """Switch the given nodes to the desired power state via out-of-band.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @rtype: bool
  @return: True if the command succeeded for every node, False otherwise

  """
  command = constants.OOB_POWER_ON if power else constants.OOB_POWER_OFF

  oob_op = opcodes.OpOobCommand(node_names=node_list,
                                command=command,
                                ignore_status=True,
                                timeout=opts.oob_timeout,
                                power_delay=opts.power_delay)

  all_succeeded = True
  # Each entry is a pair of (status, node name) and (status, data) tuples
  for ((_, node_name), (data_status, _)) in SubmitOpCode(oob_op, opts=opts):
    if data_status == constants.RS_NORMAL:
      continue

    assert data_status != constants.RS_UNAVAIL
    all_succeeded = False
    ToStderr("There was a problem changing power for %s, please investigate",
             node_name)

  return all_succeeded
1364
def _InstanceStart(opts, inst_list, start, no_remember=False):
  """Start up or shut down all instances in the list.

  One job per instance is queued and the results of all jobs are collected
  before reporting.

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @param no_remember: Passed as C{no_remember} to the shutdown opcode; only
    used when shutting down
  @rtype: bool
  @return: True if no job failed, False otherwise

  """
  if start:
    make_op = opcodes.OpInstanceStartup
    (verb, done, doing) = ("startup", "started", "starting")
  else:
    make_op = compat.partial(opcodes.OpInstanceShutdown,
                             timeout=opts.shutdown_timeout,
                             no_remember=no_remember)
    (verb, done, doing) = ("shutdown", "stopped", "stopping")

  executor = JobExecutor(opts=opts)
  for name in inst_list:
    ToStdout("Submit %s of instance %s", verb, name)
    executor.QueueJob(name, make_op(instance_name=name))

  results = executor.GetResults()
  failures = sum(1 for (success, _) in results if not success)

  if failures:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", doing, failures,
             len(results))
    return False

  ToStdout("All instances have been %s successfully", done)
  return True
1404
class _RunWhenNodesReachableHelper(object):
  """Keeps the state shared between the retry function and its wait function.

  An instance is used both as the callable passed to the retry loop and,
  through L{Wait}, as the function called between two attempts.

  @ivar success: Indicates if all action_cb calls were successful
  @ivar down: Set of nodes not yet seen as reachable
  @ivar up: Set of nodes that have been seen as reachable

  """
  def __init__(self, node_list, action_cb, node2ip, port, feedback_fn,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Set up the helper.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param feedback_fn: The function used for feedback
    @param _ping_fn: Function to check reachabilty (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    self.node2ip = node2ip
    self.port = port
    self.action_cb = action_cb
    self.feedback_fn = feedback_fn
    self._ping_fn = _ping_fn
    self._sleep_fn = _sleep_fn

    # Every node starts out as unreachable
    self.down = set(node_list)
    self.up = set()
    self.success = True

  def __call__(self):
    """Run action_cb on the nodes reachable so far.

    @return: the accumulated success state, once no node is down any more
    @raises utils.RetryAgain: When there are still down nodes

    """
    callback_ok = self.action_cb(self.up)
    if not callback_ok:
      # A single failure is remembered for all later calls
      self.success = False

    if not self.down:
      return self.success

    raise utils.RetryAgain()

  def Wait(self, secs):
    """Checks if a host is up or waits remaining seconds.

    Returns right after the first node found reachable, so that the action
    callback gets a chance to run without further delay; only if no node
    responded, the remainder of the interval is slept.

    @param secs: The secs remaining

    """
    started = time.time()

    for node in self.down:
      if self._ping_fn(self.node2ip[node], self.port,
                       timeout=_EPO_PING_TIMEOUT, live_port_needed=True):
        break
    else:
      # Nobody answered; the pings already used up part of the interval
      self._sleep_fn(max(0.0, started + secs - time.time()))
      return

    self.feedback_fn("Node %s became available" % node)
    self.up.add(node)
    self.down -= self.up
1468
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  The node names are resolved to IP addresses of the cluster's primary IP
  family, and the node daemon port is pinged until all nodes answered or
  the overall timeout (C{_EPO_REACHABLE_TIMEOUT}) expired.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry
  @return: the accumulated result of the action_cb calls, or False if the
    nodes did not all become reachable in time

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
                 for node in node_list)

  port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port,
                                        ToStdout)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    # One node per line, matching the "\n - " bullet that starts the list;
    # sorted because helper.down is a set without a stable order
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n - %s", "\n - ".join(sorted(helper.down)))
    return False
1499
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Start those instances whose nodes are all online again.

  Instances that get started are removed from C{inst_map}, so that a later
  call does not start them a second time.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping
  @param nodes_online: A list of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: Success of the operation on all instances

  """
  # An instance is ready once none of its nodes is missing from nodes_online
  ready = [inst for (inst, nodes) in inst_map.items()
           if not (nodes - nodes_online)]

  for inst in ready:
    del inst_map[inst]

  if not ready:
    return True

  return _instance_start_fn(opts, ready, True)
1525
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  First powers on the nodes supporting out-of-band management, then waits
  for all nodes to become reachable, starting instances as soon as all of
  their nodes are back.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The third argument selects the power state: True powers the nodes on.
  # A failure here is only reported; nodes may still come up (or be started
  # manually), so we go on waiting for them.
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up; the callback works on a copy of the
  # mapping, as it removes the instances it has started
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1551
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances are shut down first; only if that worked, the nodes are
  powered off.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances_stopped = _InstanceStart(opts, inst_map.keys(), False,
                                     no_remember=True)
  if not instances_stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Nothing to power off counts as success
  if not node_list or _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS

  return constants.EXIT_FAILURE
1573
def Epo(opts, args, qcl=None, _on_fn=_EpoOn, _off_fn=_EpoOff,
        _confirm_fn=ConfirmOperation,
        _stdout_fn=ToStdout, _stderr_fn=ToStderr):
  """EPO operations.

  Determines the nodes and instances affected, asks for confirmation
  (unless forced) and then hands over to the power-on or power-off function.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the names of the nodes (or, with C{opts.groups}, of the node
    groups) to operate on
  @param qcl: query client; a new one is created if None
  @param _on_fn: function doing the power on (unittest use only)
  @param _off_fn: function doing the power off (unittest use only)
  @param _confirm_fn: function asking for confirmation (unittest use only)
  @param _stdout_fn: function for normal output (unittest use only)
  @param _stderr_fn: function for error output (unittest use only)
  @rtype: int
  @return: the desired exit code

  """
  if opts.show_all:
    if opts.groups:
      _stderr_fn("Only one of --groups or --all are allowed")
      return constants.EXIT_FAILURE
    if args:
      _stderr_fn("Arguments in combination with --all are not allowed")
      return constants.EXIT_FAILURE

  if qcl is None:
    # Query client
    qcl = GetClient(query=True)

  if opts.groups:
    node_query_list = \
      itertools.chain(*qcl.QueryGroups(args, ["node_list"], False))
  else:
    node_query_list = args

  fields = ["name", "master", "pinst_list", "sinst_list", "powered",
            "offline"]
  result = qcl.QueryNodes(node_query_list, fields, False)

  all_nodes = [compat.fst(row) for row in result]
  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    if not offline:
      for inst in (pinsts + sinsts):
        # The master is never recorded as a node an instance waits for
        owners = inst_map.setdefault(inst, set())
        if not master:
          owners.add(node)

    if master:
      if opts.on:
        # We ignore the master for turning on the machines, in fact we are
        # already operating on the master at this point :)
        continue
      if not opts.show_all:
        _stderr_fn("%s is the master node, please do a master-failover to"
                   " another node not affected by the EPO or use --all if"
                   " you intend to shutdown the whole cluster", node)
        return constants.EXIT_FAILURE

    if powered is None:
      _stdout_fn("Node %s does not support out-of-band handling, it can not be"
                 " handled in a fully automated manner", node)
    elif powered == opts.on:
      _stdout_fn("Node %s is already in desired power state, skipping", node)
    elif powered or not offline:
      node_list.append(node)

  if not (opts.force or _confirm_fn(all_nodes, "nodes", "epo")):
    return constants.EXIT_FAILURE

  if opts.on:
    return _on_fn(opts, all_nodes, node_list, inst_map)

  return _off_fn(opts, node_list, inst_map)
1646
def _GetCreateCommand(info):
  """Build the "gnt-cluster init" command line for the given cluster info.

  @type info: dict
  @param info: cluster information; the "ipolicy" and "name" entries are
    used
  @rtype: string
  @return: the command line

  """
  out = StringIO()
  out.write("gnt-cluster init")
  PrintIPolicyCommand(out, info["ipolicy"], False)
  for piece in (" ", info["name"]):
    out.write(piece)
  return out.getvalue()
1655
def ShowCreateCommand(opts, args):
  """Shows the command that can be used to re-create the cluster.

  Currently it works only for ipolicy specs.

  @param opts: the command line options selected by the user (unused)
  @param args: positional arguments (unused)

  """
  cluster_info = GetClient(query=True).QueryClusterInfo()
  ToStdout(_GetCreateCommand(cluster_info))
1666
def _RunCommandAndReport(cmd):
  """Run a command locally, reporting its output only if it failed.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: bool
  @return: True if the command succeeded, False if the execution failed.

  """
  outcome = utils.RunCmd(cmd)
  if not outcome.failed:
    return True

  ToStderr("Command %s failed: %s; Output %s" %
           (cmd, outcome.fail_reason, outcome.output))
  return False
1683
def _VerifyCommand(cmd):
  """Verify that a given command succeeds on all online nodes.

  As this function is intended to run during upgrades, it
  is implemented in such a way that it still works, if all Ganeti
  daemons are down: the node list is read from ssconf and the command
  is run via ssh.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: list
  @return: the list of node names that are online where
      the command failed.

  """
  command = utils.text.ShellQuoteArgs([str(val) for val in cmd])

  store = ssconf.SimpleStore()
  nodes = store.GetOnlineNodeList()
  master_node = store.GetMasterNode()
  cluster_name = store.GetClusterName()

  # If master node is in 'nodes', make sure master node is at list end
  if master_node in nodes:
    nodes.remove(master_node)
    nodes.append(master_node)

  srun = ssh.SshRunner(cluster_name=cluster_name)
  return [name for name in nodes
          if srun.Run(name, constants.SSH_LOGIN_USER, command).exit_code != 0]
1719
def _VerifyVersionInstalled(versionstring):
  """Verify that the given version of ganeti is installed on all online nodes.

  The check tests, on every online node, for the existence of the
  version-specific directory below the package library directory. Nothing
  is printed if the version is found everywhere; otherwise the nodes
  lacking it are reported on stderr.

  @param versionstring: the version to check for
  @type versionstring: string
  @rtype: bool
  @return: True, if the version is installed on all online nodes

  """
  version_dir = os.path.join(pathutils.PKGLIBDIR, versionstring)
  badnodes = _VerifyCommand(["test", "-d", version_dir])
  if not badnodes:
    return True

  ToStderr("Ganeti version %s not installed on nodes %s"
           % (versionstring, ", ".join(badnodes)))
  return False
1741
def _GetRunning():
  """Determine the number of running jobs.

  @rtype: int
  @return: the number of jobs still running

  """
  running_only = qlang.MakeSimpleFilter(
    "status", frozenset([constants.JOB_STATUS_RUNNING]))
  response = GetClient().Query(constants.QR_JOB, [], running_only)
  return len(response.data)
1754
def _SetGanetiVersion(versionstring):
  """Set the active version of ganeti to the given versionstring

  On all online nodes, the "ganeti/lib" and "ganeti/share" symlinks below
  the system configuration directory are pointed to the directories of the
  given version.

  @type versionstring: string
  @param versionstring: the version to activate
  @rtype: list
  @return: the list of nodes where the version change failed

  """
  # (directory holding the versions, name of the symlink to update)
  links = [
    (pathutils.PKGLIBDIR, "ganeti/lib"),
    (pathutils.SHAREDIR, "ganeti/share"),
    ]

  failed = []
  for (base_dir, link_name) in links:
    target = os.path.join(base_dir, versionstring)
    link = os.path.join(pathutils.SYSCONFDIR, link_name)
    if constants.HAS_GNU_LN:
      failed.extend(_VerifyCommand(["ln", "-s", "-f", "-T", target, link]))
    else:
      # Without GNU ln's -T, remove the old link before creating the new one
      failed.extend(_VerifyCommand(["rm", "-f", link]))
      failed.extend(_VerifyCommand(["ln", "-s", "-f", target, link]))

  # A node may have failed more than one command; report it once
  return list(set(failed))
1786
1787 1788 -def _ExecuteCommands(fns):
1789 """Execute a list of functions, in reverse order. 1790 1791 @type fns: list of functions. 1792 @param fns: the functions to be executed. 1793 1794 """ 1795 for fn in reversed(fns): 1796 fn()
1797
def _GetConfigVersion():
  """Determine the version the configuration file currently has.

  The cluster configuration file is read and parsed as JSON directly.

  @rtype: tuple or None
  @return: (major, minor, revision) if the version can be determined,
           None otherwise

  """
  raw_config = utils.ReadFile(pathutils.CLUSTER_CONF_FILE)
  config_data = serializer.LoadJson(raw_config)
  try:
    return utils.SplitVersion(config_data["version"])
  except KeyError:
    return None
1813
def _ReadIntentToUpgrade():
  """Read the file documenting the intent to upgrade the cluster.

  @rtype: (string, string) or (None, None)
  @return: (old version, version to upgrade to), if the file exists
    and consists of exactly three fields, and (None, None) otherwise.

  """
  intent_file = pathutils.INTENT_TO_UPGRADE

  if os.path.isfile(intent_file):
    fields = utils.UnescapeAndSplit(utils.ReadFile(intent_file))
    # Anything but three fields means the file is syntactically mal-formed
    if len(fields) == 3:
      return (fields[0], fields[1])

  return (None, None)
1832
def _WriteIntentToUpgrade(version):
  """Write file documenting the intent to upgrade the cluster.

  The file records the currently running release version, the target
  version and the process id of the writer.

  @type version: string
  @param version: the version we intent to upgrade to

  """
  fields = [constants.RELEASE_VERSION, version, "%d" % os.getpid()]
  utils.WriteFile(pathutils.INTENT_TO_UPGRADE,
                  data=utils.EscapeAndJoin(fields))
1844
def _UpgradeBeforeConfigurationChange(versionstring):
  """
  Carry out all the tasks necessary for an upgrade that happen before
  the configuration file, or Ganeti version, changes.

  In order, this verifies that the target version is installed, records
  the intent to upgrade, drains the job queue, pauses the watcher, stops
  all daemons and backs up the data directory. For every step that needs
  undoing, a task is appended to the rollback list.

  @type versionstring: string
  @param versionstring: the version to upgrade to
  @rtype: (bool, list)
  @return: tuple of a bool indicating success and a list of rollback tasks

  """
  rollback = []

  if not _VerifyVersionInstalled(versionstring):
    return (False, rollback)

  _WriteIntentToUpgrade(versionstring)
  rollback.append(
    lambda: utils.RunCmd(["rm", "-f", pathutils.INTENT_TO_UPGRADE]))

  ToStdout("Draining queue")
  client = GetClient()
  client.SetQueueDrainFlag(True)

  rollback.append(lambda: GetClient().SetQueueDrainFlag(False))

  # Poll until the number of running jobs is 0; a non-zero final value
  # means the queue could not be emptied in time
  if utils.SimpleRetry(0, _GetRunning,
                       constants.UPGRADE_QUEUE_POLL_INTERVAL,
                       constants.UPGRADE_QUEUE_DRAIN_TIMEOUT):
    ToStderr("Failed to completely empty the queue.")
    return (False, rollback)

  ToStdout("Pausing the watcher for one hour.")
  rollback.append(lambda: GetClient().SetWatcherPause(None))
  GetClient().SetWatcherPause(time.time() + 60 * 60)

  ToStdout("Stopping daemons on master node.")
  if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
    return (False, rollback)

  # Check again, now that the daemons on the master are stopped
  if not _VerifyVersionInstalled(versionstring):
    utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
    return (False, rollback)

  ToStdout("Stopping daemons everywhere.")
  rollback.append(lambda: _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  if badnodes:
    ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
    return (False, rollback)

  backuptar = os.path.join(pathutils.BACKUP_DIR, "ganeti%d.tar" % time.time())
  ToStdout("Backing up configuration as %s" % backuptar)
  if not _RunCommandAndReport(["mkdir", "-p", pathutils.BACKUP_DIR]):
    return (False, rollback)

  # Create the archive in a safe manner, as it contains sensitive
  # information.
  (tmp_fd, tmp_name) = tempfile.mkstemp(prefix=backuptar,
                                        dir=pathutils.BACKUP_DIR)
  # mkstemp hands back an open descriptor; tar writes to the file by name,
  # so close ours right away instead of leaking it
  os.close(tmp_fd)
  if not _RunCommandAndReport(["tar", "-cf", tmp_name,
                               "--exclude=queue/archive",
                               pathutils.DATA_DIR]):
    return (False, rollback)

  os.rename(tmp_name, backuptar)
  return (True, rollback)
1912
def _VersionSpecificDowngrade():
  """
  Perform any additional downgrade tasks that are version specific
  and need to be done just after the configuration downgrade. This
  function needs to be idempotent, so that it can be redone if the
  downgrade procedure gets interrupted after changing the
  configuration.

  Note that this function has to be reset with every version bump.

  @rtype: bool
  @return: True upon success
  """
  ToStdout("Performing version-specific downgrade tasks.")

  ToStdout("...removing client certificates ssconf file")
  certs_ssconf = ssconf.SimpleStore().KeyToFilename(
    constants.SS_MASTER_CANDIDATES_CERTS)
  failed_nodes = _VerifyCommand(["rm", "-f", certs_ssconf])
  if failed_nodes:
    ToStderr("Warning: failed to clean up ssconf on %s."
             % (", ".join(failed_nodes),))
    return False

  ToStdout("...removing client certificates")
  failed_nodes = _VerifyCommand(["rm", "-f",
                                 pathutils.NODED_CLIENT_CERT_FILE])
  if failed_nodes:
    ToStderr("Warning: failed to clean up certificates on %s."
             % (", ".join(failed_nodes),))
    return False

  return True
1945
def _SwitchVersionAndConfig(versionstring, downgrade):
  """
  Switch to the new Ganeti version and change the configuration,
  in correct order.

  For a downgrade the configuration is changed first and the version is
  switched afterwards; for an upgrade it is the other way round.

  @type versionstring: string
  @param versionstring: the version to change to
  @type downgrade: bool
  @param downgrade: True, if the configuration should be downgraded
  @rtype: (bool, list)
  @return: tuple of a bool indicating success, and a list of
      additional rollback tasks

  """
  upgrade = not downgrade
  rollback = []

  if downgrade:
    ToStdout("Downgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
      return (False, rollback)
    # Note: version specific downgrades need to be done before switching
    # binaries, so that we still have the knowledgeable binary if the downgrade
    # process gets interrupted at this point.
    if not _VersionSpecificDowngrade():
      return (False, rollback)

  # Configuration change is the point of no return. From then onwards, it is
  # safer to push through the up/downgrade than to try to roll it back.

  ToStdout("Switching to version %s on all nodes" % versionstring)
  rollback.append(lambda: _SetGanetiVersion(constants.DIR_VERSION))
  failed_nodes = _SetGanetiVersion(versionstring)
  if failed_nodes:
    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
             % (versionstring, ", ".join(failed_nodes)))
    # A downgrade has already changed the configuration, so it carries on
    if upgrade:
      return (False, rollback)

  # Now that we have changed to the new version of Ganeti we should
  # not communicate over luxi any more, as luxi might have changed in
  # incompatible ways. Therefore, manually call the corresponding ganeti
  # commands using their canonical (version independent) path.

  if upgrade:
    ToStdout("Upgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
      return (False, rollback)

  return (True, rollback)
1995
def _UpgradeAfterConfigurationChange(oldversion):
  """
  Carry out the upgrade actions necessary after switching to the new
  Ganeti version and updating the configuration.

  As this part is run at a time where the new version of Ganeti is already
  running, no communication should happen via luxi, as this is not a stable
  interface. Also, as the configuration change is the point of no return,
  all actions are pushed through, even if some of them fail.

  @param oldversion: the version the upgrade started from
  @type oldversion: string
  @rtype: int
  @return: the intended return value; 1 if any of the steps failed,
    0 otherwise

  """
  exit_code = 0
  daemon_util = pathutils.DAEMON_UTIL

  ToStdout("Ensuring directories everywhere.")
  failed_nodes = _VerifyCommand([pathutils.ENSURE_DIRS])
  if failed_nodes:
    ToStderr("Warning: failed to ensure directories on %s." %
             (", ".join(failed_nodes)))
    exit_code = 1

  ToStdout("Starting daemons everywhere.")
  failed_nodes = _VerifyCommand([daemon_util, "start-all"])
  if failed_nodes:
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(failed_nodes),))
    exit_code = 1

  ToStdout("Redistributing the configuration.")
  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
    exit_code = 1

  ToStdout("Restarting daemons everywhere.")
  failed_nodes = _VerifyCommand([daemon_util, "stop-all"])
  failed_nodes.extend(_VerifyCommand([daemon_util, "start-all"]))
  if failed_nodes:
    # A node failing both stop and start is listed only once
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(list(set(failed_nodes))),))
    exit_code = 1

  ToStdout("Undraining the queue.")
  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
    exit_code = 1

  # Failing to remove the intent file does not affect the return value
  _RunCommandAndReport(["rm", "-f", pathutils.INTENT_TO_UPGRADE])

  ToStdout("Running post-upgrade hooks")
  if not _RunCommandAndReport([pathutils.POST_UPGRADE, oldversion]):
    exit_code = 1

  ToStdout("Unpausing the watcher.")
  if not _RunCommandAndReport(["gnt-cluster", "watcher", "continue"]):
    exit_code = 1

  ToStdout("Verifying cluster.")
  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
    exit_code = 1

  return exit_code
2059
def UpgradeGanetiCommand(opts, args):
  """Upgrade a cluster to a new ganeti version.

  Exactly one of C{opts.to} (start an upgrade to the given version) and
  C{opts.resume} (continue the upgrade recorded in the intent-to-upgrade
  information) has to be set. If C{opts.to} names the target of an upgrade
  that is already in progress, the call is turned into a resume; note that
  C{opts} is modified in place in that case.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if ((not opts.resume and opts.to is None)
      or (opts.resume and opts.to is not None)):
    ToStderr("Precisely one of the options --to and --resume"
             " has to be given")
    return 1

  # If we're not told to resume, verify there is no upgrade
  # in progress.
  if not opts.resume:
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is not None:
      # An upgrade is going on; verify whether the target matches
      if versionstring == opts.to:
        ToStderr("An upgrade is already in progress. Target version matches,"
                 " resuming.")
        opts.resume = True
        opts.to = None
      else:
        ToStderr("An upgrade from %s to %s is in progress; use --resume to"
                 " finish it first" % (oldversion, versionstring))
        return 1

  # Default for a fresh upgrade; overwritten below with the recorded
  # starting version when resuming.
  oldversion = constants.RELEASE_VERSION

  if opts.resume:
    ssconf.CheckMaster(False)
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is None:
      # No upgrade recorded, hence nothing to resume.
      return 0
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      # Report the failure the same way the non-resume path does instead of
      # exiting with a bare error code.
      ToStderr("Could not parse version string %s" % versionstring)
      return 1
    configversion = _GetConfigVersion()
    if configversion is None:
      ToStderr("Could not determine the version of the current configuration")
      return 1
    # If the upgrade we resume was an upgrade between compatible
    # versions (like 2.10.0 to 2.10.1), the correct configversion
    # does not guarantee that the config has been updated.
    # However, in the case of a compatible update with the configuration
    # not touched, we are running a different dirversion with the same
    # config version.
    config_already_modified = \
      (utils.IsCorrectConfigVersion(version, configversion) and
       not (versionstring != constants.DIR_VERSION and
            configversion == (constants.CONFIG_MAJOR, constants.CONFIG_MINOR,
                              constants.CONFIG_REVISION)))
    if not config_already_modified:
      # We have to start from the beginning; however, some daemons might have
      # already been stopped, so the only way to get into a well-defined state
      # is by starting all daemons again.
      _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  else:
    versionstring = opts.to
    config_already_modified = False
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      ToStderr("Could not parse version string %s" % versionstring)
      return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

  if not config_already_modified:
    success, rollback = _UpgradeBeforeConfigurationChange(versionstring)
    if not success:
      _ExecuteCommands(rollback)
      return 1
  else:
    # The preparatory steps are skipped, so there is nothing of them to
    # roll back.
    rollback = []

  downgrade = utils.version.ShouldCfgdowngrade(version)

  success, additionalrollback = \
      _SwitchVersionAndConfig(versionstring, downgrade)
  if not success:
    rollback.extend(additionalrollback)
    _ExecuteCommands(rollback)
    return 1

  return _UpgradeAfterConfigurationChange(oldversion)
#: dictionary mapping each gnt-cluster sub-command name to its definition.
#: As can be read from the entries below, every value is a 5-tuple of
#: (handler function, positional argument specification, list of option
#: objects, usage string for the arguments, one-line description).
commands = {
  "init": (
    InitCluster, [ArgHost(min=1, max=1)],
    [BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
     HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT,
     NIC_PARAMS_OPT, NOMODIFY_ETCHOSTS_OPT, NOMODIFY_SSH_SETUP_OPT,
     SECONDARY_IP_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT,
     DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT, DEFAULT_IALLOCATOR_PARAMS_OPT,
     PRIMARY_IP_VERSION_OPT, PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT,
     GLOBAL_SHARED_FILEDIR_OPT, USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT,
     HV_STATE_OPT, DISK_STATE_OPT, ENABLED_DISK_TEMPLATES_OPT,
     ENABLED_USER_SHUTDOWN_OPT, IPOLICY_STD_SPECS_OPT,
     GLOBAL_GLUSTER_FILEDIR_OPT]
     + INSTANCE_POLICY_OPTS + SPLIT_ISPECS_OPTS,
    "[opts...] <cluster_name>", "Initialises a new cluster configuration"),
  "destroy": (
    DestroyCluster, ARGS_NONE, [YES_DOIT_OPT],
    "", "Destroy cluster"),
  "rename": (
    RenameCluster, [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster"),
  "redist-conf": (
    RedistributeConfig, ARGS_NONE, SUBMIT_OPTS +
    [DRY_RUN_OPT, PRIORITY_OPT, FORCE_DISTRIBUTION],
    "", "Forces a push of the configuration file and ssconf files"
    " to the nodes in the cluster"),
  "verify": (
    VerifyCluster, ARGS_NONE,
    [VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
     DRY_RUN_OPT, PRIORITY_OPT, NODEGROUP_OPT, IGNORE_ERRORS_OPT],
    "", "Does a check on the cluster configuration"),
  "verify-disks": (
    VerifyDisks, ARGS_NONE, [PRIORITY_OPT],
    "", "Does a check on the cluster disk status"),
  "repair-disk-sizes": (
    RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]", "Updates mismatches in recorded disk sizes"),
  "master-failover": (
    MasterFailover, ARGS_NONE, [NOVOTING_OPT, FORCE_FAILOVER],
    "", "Makes the current node the master"),
  "master-ping": (
    MasterPing, ARGS_NONE, [],
    "", "Checks if the master is alive"),
  "version": (
    ShowClusterVersion, ARGS_NONE, [],
    "", "Shows the cluster version"),
  "getmaster": (
    ShowClusterMaster, ARGS_NONE, [],
    "", "Shows the cluster master"),
  "copyfile": (
    ClusterCopyFile, [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>", "Copies a file to all (or only some) nodes"),
  "command": (
    RunClusterCommand, [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT, SHOW_MACHINE_OPT, FAILURE_ONLY_OPT],
    "[-n node...] <command>", "Runs a command on all (or only some) nodes"),
  "info": (
    ShowClusterConfig, ARGS_NONE, [ROMAN_OPT],
    "[--roman]", "Show cluster configuration"),
  "list-tags": (
    ListTags, ARGS_NONE, [], "", "List the tags of the cluster"),
  "add-tags": (
    AddTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Add tags to the cluster"),
  "remove-tags": (
    RemoveTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Remove tags from the cluster"),
  "search-tags": (
    SearchTags, [ArgUnknown(min=1, max=1)], [PRIORITY_OPT], "",
    "Searches the tags on all objects on"
    " the cluster for a given pattern (regex)"),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [], "drain|undrain|info", "Change queue properties"),
  "watcher": (
    WatcherOps,
    [ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
     ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"])],
    [],
    "{pause <timespec>|continue|info}", "Change watcher properties"),
  "modify": (
    SetClusterParams, ARGS_NONE,
    [FORCE_OPT,
     BACKEND_OPT, CP_SIZE_OPT, RQL_OPT,
     ENABLED_HV_OPT, HVLIST_OPT, MASTER_NETDEV_OPT,
     MASTER_NETMASK_OPT, NIC_PARAMS_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT,
     UIDPOOL_OPT, ADD_UIDS_OPT, REMOVE_UIDS_OPT, DRBD_HELPER_OPT,
     DEFAULT_IALLOCATOR_OPT, DEFAULT_IALLOCATOR_PARAMS_OPT, RESERVED_LVS_OPT,
     DRY_RUN_OPT, PRIORITY_OPT, PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT,
     USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT] +
     SUBMIT_OPTS +
     [ENABLED_DISK_TEMPLATES_OPT, IPOLICY_STD_SPECS_OPT, MODIFY_ETCHOSTS_OPT,
      ENABLED_USER_SHUTDOWN_OPT] +
     INSTANCE_POLICY_OPTS + [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT],
    "[opts...]",
    "Alters the parameters of the cluster"),
  "renew-crypto": (
    RenewCrypto, ARGS_NONE,
    [NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
     NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
     NEW_SPICE_CERT_OPT, SPICE_CERT_OPT, SPICE_CACERT_OPT,
     NEW_NODE_CERT_OPT],
    "[opts...]",
    "Renews cluster certificates, keys and secrets"),
  "epo": (
    Epo, [ArgUnknown()],
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
     SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT],
    "[opts...] [args]",
    "Performs an emergency power-off on given args"),
  "activate-master-ip": (
    ActivateMasterIp, ARGS_NONE, [], "", "Activates the master IP"),
  "deactivate-master-ip": (
    DeactivateMasterIp, ARGS_NONE, [CONFIRM_OPT], "",
    "Deactivates the master IP"),
  "show-ispecs-cmd": (
    ShowCreateCommand, ARGS_NONE, [], "",
    "Show the command line to re-create the cluster"),
  "upgrade": (
    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
    "Upgrade (or downgrade) to a new Ganeti version"),
  }


#: dictionary with aliases for commands; each key is an alternative name
#: and each value is the key of the corresponding entry in C{commands}
aliases = {
  "masterfailover": "master-failover",
  "show": "info",
  }
def Main():
  """Hand this module's command table over to C{GenericMain}.

  The command and alias dictionaries defined in this module are passed on
  together with an override setting C{tag_type} to the cluster tag kind.

  @return: whatever C{GenericMain} returns

  """
  tag_override = {"tag_type": constants.TAG_CLUSTER}
  return GenericMain(commands, override=tag_override, aliases=aliases)
2294