
Source Code for Module ganeti.cmdlib.cluster

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30   
  31  """Logical units dealing with the cluster.""" 
  32   
  33  import copy 
  34  import itertools 
  35  import logging 
  36  import operator 
  37  import os 
  38  import re 
  39  import time 
  40   
  41  from ganeti import compat 
  42  from ganeti import constants 
  43  from ganeti import errors 
  44  from ganeti import hypervisor 
  45  from ganeti import locking 
  46  from ganeti import masterd 
  47  from ganeti import netutils 
  48  from ganeti import objects 
  49  from ganeti import opcodes 
  50  from ganeti import pathutils 
  51  from ganeti import query 
  52  import ganeti.rpc.node as rpc 
  53  from ganeti import runtime 
  54  from ganeti import ssh 
  55  from ganeti import uidpool 
  56  from ganeti import utils 
  57  from ganeti import vcluster 
  58   
  59  from ganeti.cmdlib.base import NoHooksLU, QueryBase, LogicalUnit, \ 
  60    ResultWithJobs 
  61  from ganeti.cmdlib.common import ShareAll, RunPostHook, \ 
  62    ComputeAncillaryFiles, RedistributeAncillaryFiles, UploadHelper, \ 
  63    GetWantedInstances, MergeAndVerifyHvState, MergeAndVerifyDiskState, \ 
  64    GetUpdatedIPolicy, ComputeNewInstanceViolations, GetUpdatedParams, \ 
  65    CheckOSParams, CheckHVParams, AdjustCandidatePool, CheckNodePVs, \ 
  66    ComputeIPolicyInstanceViolation, AnnotateDiskParams, SupportsOob, \ 
  67    CheckIpolicyVsDiskTemplates, CheckDiskAccessModeValidity, \ 
  68    CheckDiskAccessModeConsistency, GetClientCertDigest, \ 
  69    AddInstanceCommunicationNetworkOp, ConnectInstanceCommunicationNetworkOp, \ 
  70    CheckImageValidity, CheckDiskAccessModeConsistency, EnsureKvmdOnNodes 
  71   
  72  import ganeti.masterd.instance 
73 74 75 -class LUClusterRenewCrypto(NoHooksLU):
76 """Renew the cluster's crypto tokens. 77 78 Note that most of this operation is done in gnt_cluster.py; this LU only 79 takes care of the renewal of the client SSL certificates. 80 81 """ 82 _MAX_NUM_RETRIES = 3 83
84 - def Exec(self, feedback_fn):
85 master_uuid = self.cfg.GetMasterNode() 86 cluster = self.cfg.GetClusterInfo() 87 88 logging.debug("Renewing the master's SSL node certificate." 89 " Master's UUID: %s.", master_uuid) 90 91 # mapping node UUIDs to client certificate digests 92 digest_map = {} 93 master_digest = utils.GetCertificateDigest( 94 cert_filename=pathutils.NODED_CLIENT_CERT_FILE) 95 digest_map[master_uuid] = master_digest 96 logging.debug("Adding the master's SSL node certificate digest to the" 97 " configuration. Master's UUID: %s, Digest: %s", 98 master_uuid, master_digest) 99 100 node_errors = {} 101 nodes = self.cfg.GetAllNodesInfo() 102 logging.debug("Renewing non-master nodes' node certificates.") 103 for (node_uuid, node_info) in nodes.items(): 104 if node_info.offline: 105 feedback_fn("* Skipping offline node %s" % node_info.name) 106 logging.debug("Skipping offline node %s (UUID: %s).", 107 node_info.name, node_uuid) 108 continue 109 if node_uuid != master_uuid: 110 logging.debug("Adding certificate digest of node '%s'.", node_uuid) 111 last_exception = None 112 for i in range(self._MAX_NUM_RETRIES): 113 try: 114 if node_info.master_candidate: 115 node_digest = GetClientCertDigest(self, node_uuid) 116 digest_map[node_uuid] = node_digest 117 logging.debug("Added the node's certificate to candidate" 118 " certificate list. Current list: %s.", 119 str(cluster.candidate_certs)) 120 break 121 except errors.OpExecError as e: 122 last_exception = e 123 logging.error("Could not fetch a non-master node's SSL node" 124 " certificate at attempt no. %s. The node's UUID" 125 " is %s, and the error was: %s.", 126 str(i), node_uuid, e) 127 else: 128 if last_exception: 129 node_errors[node_uuid] = last_exception 130 131 if node_errors: 132 msg = ("Some nodes' SSL client certificates could not be fetched." 133 " Please make sure those nodes are reachable and rerun" 134 " the operation. The affected nodes and their errors are:\n") 135 for uuid, e in node_errors.items(): 136 msg += "Node %s: %s\n" % (uuid, e) 137 feedback_fn(msg) 138 139 self.cfg.SetCandidateCerts(digest_map)
140
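
The retry loop in Exec above relies on Python's for/else construct: the else branch runs only if all _MAX_NUM_RETRIES attempts failed without hitting the break. A minimal, self-contained sketch of the same pattern; collect_digests and get_digest are hypothetical stand-ins for the LU and its RPC helpers, not part of Ganeti:

MAX_NUM_RETRIES = 3

def collect_digests(node_uuids, get_digest):
  """Fetch each node's certificate digest, retrying a few times per node."""
  digests = {}
  failures = {}
  for node_uuid in node_uuids:
    last_exc = None
    for _ in range(MAX_NUM_RETRIES):
      try:
        digests[node_uuid] = get_digest(node_uuid)
        break  # success, stop retrying this node
      except RuntimeError as exc:  # stand-in for errors.OpExecError
        last_exc = exc
    else:  # the inner loop ran out of retries without a break
      failures[node_uuid] = last_exc
  return digests, failures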
141 142 -class LUClusterActivateMasterIp(NoHooksLU):
143 """Activate the master IP on the master node. 144 145 """
146 - def Exec(self, feedback_fn):
147 """Activate the master IP. 148 149 """ 150 master_params = self.cfg.GetMasterNetworkParameters() 151 ems = self.cfg.GetUseExternalMipScript() 152 result = self.rpc.call_node_activate_master_ip(master_params.uuid, 153 master_params, ems) 154 result.Raise("Could not activate the master IP")
155
156 157 -class LUClusterDeactivateMasterIp(NoHooksLU):
158 """Deactivate the master IP on the master node. 159 160 """
161 - def Exec(self, feedback_fn):
162 """Deactivate the master IP. 163 164 """ 165 master_params = self.cfg.GetMasterNetworkParameters() 166 ems = self.cfg.GetUseExternalMipScript() 167 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid, 168 master_params, ems) 169 result.Raise("Could not deactivate the master IP")
170
171 172 -class LUClusterConfigQuery(NoHooksLU):
173 """Return configuration values. 174 175 """ 176 REQ_BGL = False 177
178 - def CheckArguments(self):
179 self.cq = ClusterQuery(None, self.op.output_fields, False)
180
181 - def ExpandNames(self):
182 self.cq.ExpandNames(self)
183
184 - def DeclareLocks(self, level):
185 self.cq.DeclareLocks(self, level)
186
187 - def Exec(self, feedback_fn):
188 result = self.cq.OldStyleQuery(self) 189 190 assert len(result) == 1 191 192 return result[0]
193
194 195 -class LUClusterDestroy(LogicalUnit):
196 """Logical unit for destroying the cluster. 197 198 """ 199 HPATH = "cluster-destroy" 200 HTYPE = constants.HTYPE_CLUSTER 201 202 # Read by the job queue to detect when the cluster is gone and job files will 203 # never be available. 204 # FIXME: This variable should be removed together with the Python job queue. 205 clusterHasBeenDestroyed = False 206
207 - def BuildHooksEnv(self):
208 """Build hooks env. 209 210 """ 211 return { 212 "OP_TARGET": self.cfg.GetClusterName(), 213 }
214
215 - def BuildHooksNodes(self):
216 """Build hooks nodes. 217 218 """ 219 return ([], [])
220
221 - def CheckPrereq(self):
222 """Check prerequisites. 223 224 This checks whether the cluster is empty. 225 226 Any errors are signaled by raising errors.OpPrereqError. 227 228 """ 229 master = self.cfg.GetMasterNode() 230 231 nodelist = self.cfg.GetNodeList() 232 if len(nodelist) != 1 or nodelist[0] != master: 233 raise errors.OpPrereqError("There are still %d node(s) in" 234 " this cluster." % (len(nodelist) - 1), 235 errors.ECODE_INVAL) 236 instancelist = self.cfg.GetInstanceList() 237 if instancelist: 238 raise errors.OpPrereqError("There are still %d instance(s) in" 239 " this cluster." % len(instancelist), 240 errors.ECODE_INVAL)
241
242 - def Exec(self, feedback_fn):
243 """Destroys the cluster. 244 245 """ 246 master_params = self.cfg.GetMasterNetworkParameters() 247 248 # Run post hooks on master node before it's removed 249 RunPostHook(self, self.cfg.GetNodeName(master_params.uuid)) 250 251 ems = self.cfg.GetUseExternalMipScript() 252 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid, 253 master_params, ems) 254 result.Warn("Error disabling the master IP address", self.LogWarning) 255 256 self.wconfd.Client().PrepareClusterDestruction(self.wconfdcontext) 257 258 # signal to the job queue that the cluster is gone 259 LUClusterDestroy.clusterHasBeenDestroyed = True 260 261 return master_params.uuid
262
263 264 -class LUClusterPostInit(LogicalUnit):
265 """Logical unit for running hooks after cluster initialization. 266 267 """ 268 HPATH = "cluster-init" 269 HTYPE = constants.HTYPE_CLUSTER 270
271 - def CheckArguments(self):
272 self.master_uuid = self.cfg.GetMasterNode() 273 self.master_ndparams = self.cfg.GetNdParams(self.cfg.GetMasterNodeInfo()) 274 275 # TODO: When Issue 584 is solved, and None is properly parsed when used 276 # as a default value, ndparams.get(.., None) can be changed to 277 # ndparams[..] to access the values directly 278 279 # OpenvSwitch: Warn user if link is missing 280 if (self.master_ndparams[constants.ND_OVS] and not 281 self.master_ndparams.get(constants.ND_OVS_LINK, None)): 282 self.LogInfo("No physical interface for OpenvSwitch was given." 283 " OpenvSwitch will not have an outside connection. This" 284 " might not be what you want.")
285
286 - def BuildHooksEnv(self):
287 """Build hooks env. 288 289 """ 290 return { 291 "OP_TARGET": self.cfg.GetClusterName(), 292 }
293
294 - def BuildHooksNodes(self):
295 """Build hooks nodes. 296 297 """ 298 return ([], [self.cfg.GetMasterNode()])
299
300 - def Exec(self, feedback_fn):
301 """Create and configure Open vSwitch 302 303 """ 304 if self.master_ndparams[constants.ND_OVS]: 305 result = self.rpc.call_node_configure_ovs( 306 self.master_uuid, 307 self.master_ndparams[constants.ND_OVS_NAME], 308 self.master_ndparams.get(constants.ND_OVS_LINK, None)) 309 result.Raise("Could not successfully configure Open vSwitch") 310 311 return True
312
313 314 -class ClusterQuery(QueryBase):
315 FIELDS = query.CLUSTER_FIELDS 316 317 #: Do not sort (there is only one item) 318 SORT_FIELD = None 319
320 - def ExpandNames(self, lu):
321 lu.needed_locks = {} 322 323 # The following variables interact with _QueryBase._GetNames 324 self.wanted = locking.ALL_SET 325 self.do_locking = self.use_locking 326 327 if self.do_locking: 328 raise errors.OpPrereqError("Can not use locking for cluster queries", 329 errors.ECODE_INVAL)
330
331 - def DeclareLocks(self, lu, level):
332 pass
333
334 - def _GetQueryData(self, lu):
335 """Computes the cluster data for the query. 336 337 """ 338 if query.CQ_CONFIG in self.requested_data: 339 cluster = lu.cfg.GetClusterInfo() 340 nodes = lu.cfg.GetAllNodesInfo() 341 else: 342 cluster = NotImplemented 343 nodes = NotImplemented 344 345 if query.CQ_QUEUE_DRAINED in self.requested_data: 346 drain_flag = os.path.exists(pathutils.JOB_QUEUE_DRAIN_FILE) 347 else: 348 drain_flag = NotImplemented 349 350 if query.CQ_WATCHER_PAUSE in self.requested_data: 351 master_node_uuid = lu.cfg.GetMasterNode() 352 353 result = lu.rpc.call_get_watcher_pause(master_node_uuid) 354 result.Raise("Can't retrieve watcher pause from master node '%s'" % 355 lu.cfg.GetMasterNodeName()) 356 357 watcher_pause = result.payload 358 else: 359 watcher_pause = NotImplemented 360 361 return query.ClusterQueryData(cluster, nodes, drain_flag, watcher_pause)
362
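
_GetQueryData only reads the configuration, the drain file and the watcher-pause RPC when the corresponding CQ_* flag was requested, and uses NotImplemented as a "not requested" placeholder in ClusterQueryData. A toy sketch of that gating idea; build_query_data and its field names are made up for illustration and are not part of the query API:

def build_query_data(requested, loaders):
  """Return real values for requested fields only, NotImplemented otherwise.

  requested: set of field names the caller asked for
  loaders: mapping of field name to a zero-argument callable that is
           potentially expensive (config read, RPC call, ...)
  """
  return dict((name, loader() if name in requested else NotImplemented)
              for name, loader in loaders.items())

data = build_query_data({"config"},
                        {"config": lambda: {"name": "cluster1"},
                         "watcher_pause": lambda: 0})
assert data["config"] == {"name": "cluster1"}
assert data["watcher_pause"] is NotImplemented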
363 364 -class LUClusterQuery(NoHooksLU):
365 """Query cluster configuration. 366 367 """ 368 REQ_BGL = False 369
370 - def ExpandNames(self):
371 self.needed_locks = {}
372
373 - def Exec(self, feedback_fn):
374 """Return cluster config. 375 376 """ 377 cluster = self.cfg.GetClusterInfo() 378 os_hvp = {} 379 380 # Filter just for enabled hypervisors 381 for os_name, hv_dict in cluster.os_hvp.items(): 382 os_hvp[os_name] = {} 383 for hv_name, hv_params in hv_dict.items(): 384 if hv_name in cluster.enabled_hypervisors: 385 os_hvp[os_name][hv_name] = hv_params 386 387 # Convert ip_family to ip_version 388 primary_ip_version = constants.IP4_VERSION 389 if cluster.primary_ip_family == netutils.IP6Address.family: 390 primary_ip_version = constants.IP6_VERSION 391 392 result = { 393 "software_version": constants.RELEASE_VERSION, 394 "protocol_version": constants.PROTOCOL_VERSION, 395 "config_version": constants.CONFIG_VERSION, 396 "os_api_version": max(constants.OS_API_VERSIONS), 397 "export_version": constants.EXPORT_VERSION, 398 "vcs_version": constants.VCS_VERSION, 399 "architecture": runtime.GetArchInfo(), 400 "name": cluster.cluster_name, 401 "master": self.cfg.GetMasterNodeName(), 402 "default_hypervisor": cluster.primary_hypervisor, 403 "enabled_hypervisors": cluster.enabled_hypervisors, 404 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name]) 405 for hypervisor_name in cluster.enabled_hypervisors]), 406 "os_hvp": os_hvp, 407 "beparams": cluster.beparams, 408 "osparams": cluster.osparams, 409 "ipolicy": cluster.ipolicy, 410 "nicparams": cluster.nicparams, 411 "ndparams": cluster.ndparams, 412 "diskparams": cluster.diskparams, 413 "candidate_pool_size": cluster.candidate_pool_size, 414 "max_running_jobs": cluster.max_running_jobs, 415 "max_tracked_jobs": cluster.max_tracked_jobs, 416 "mac_prefix": cluster.mac_prefix, 417 "master_netdev": cluster.master_netdev, 418 "master_netmask": cluster.master_netmask, 419 "use_external_mip_script": cluster.use_external_mip_script, 420 "volume_group_name": cluster.volume_group_name, 421 "drbd_usermode_helper": cluster.drbd_usermode_helper, 422 "file_storage_dir": cluster.file_storage_dir, 423 "shared_file_storage_dir": cluster.shared_file_storage_dir, 424 "maintain_node_health": cluster.maintain_node_health, 425 "ctime": cluster.ctime, 426 "mtime": cluster.mtime, 427 "uuid": cluster.uuid, 428 "tags": list(cluster.GetTags()), 429 "uid_pool": cluster.uid_pool, 430 "default_iallocator": cluster.default_iallocator, 431 "default_iallocator_params": cluster.default_iallocator_params, 432 "reserved_lvs": cluster.reserved_lvs, 433 "primary_ip_version": primary_ip_version, 434 "prealloc_wipe_disks": cluster.prealloc_wipe_disks, 435 "hidden_os": cluster.hidden_os, 436 "blacklisted_os": cluster.blacklisted_os, 437 "enabled_disk_templates": cluster.enabled_disk_templates, 438 "install_image": cluster.install_image, 439 "instance_communication_network": cluster.instance_communication_network, 440 "compression_tools": cluster.compression_tools, 441 "enabled_user_shutdown": cluster.enabled_user_shutdown, 442 } 443 444 return result
445
446 447 -class LUClusterRedistConf(NoHooksLU):
448 """Force the redistribution of cluster configuration. 449 450 This is a very simple LU. 451 452 """ 453 REQ_BGL = False 454
455 - def ExpandNames(self):
456 self.needed_locks = { 457 locking.LEVEL_NODE: locking.ALL_SET, 458 locking.LEVEL_NODE_ALLOC: locking.ALL_SET, 459 } 460 self.share_locks = ShareAll()
461
462 - def Exec(self, feedback_fn):
463 """Redistribute the configuration. 464 465 """ 466 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn) 467 RedistributeAncillaryFiles(self)
468
469 470 -class LUClusterRename(LogicalUnit):
471 """Rename the cluster. 472 473 """ 474 HPATH = "cluster-rename" 475 HTYPE = constants.HTYPE_CLUSTER 476
477 - def BuildHooksEnv(self):
478 """Build hooks env. 479 480 """ 481 return { 482 "OP_TARGET": self.cfg.GetClusterName(), 483 "NEW_NAME": self.op.name, 484 }
485
486 - def BuildHooksNodes(self):
487 """Build hooks nodes. 488 489 """ 490 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
491
492 - def CheckPrereq(self):
493 """Verify that the passed name is a valid one. 494 495 """ 496 hostname = netutils.GetHostname(name=self.op.name, 497 family=self.cfg.GetPrimaryIPFamily()) 498 499 new_name = hostname.name 500 self.ip = new_ip = hostname.ip 501 old_name = self.cfg.GetClusterName() 502 old_ip = self.cfg.GetMasterIP() 503 if new_name == old_name and new_ip == old_ip: 504 raise errors.OpPrereqError("Neither the name nor the IP address of the" 505 " cluster has changed", 506 errors.ECODE_INVAL) 507 if new_ip != old_ip: 508 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT): 509 raise errors.OpPrereqError("The given cluster IP address (%s) is" 510 " reachable on the network" % 511 new_ip, errors.ECODE_NOTUNIQUE) 512 513 self.op.name = new_name
514
515 - def Exec(self, feedback_fn):
516 """Rename the cluster. 517 518 """ 519 clustername = self.op.name 520 new_ip = self.ip 521 522 # shutdown the master IP 523 master_params = self.cfg.GetMasterNetworkParameters() 524 ems = self.cfg.GetUseExternalMipScript() 525 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid, 526 master_params, ems) 527 result.Raise("Could not disable the master role") 528 529 try: 530 cluster = self.cfg.GetClusterInfo() 531 cluster.cluster_name = clustername 532 cluster.master_ip = new_ip 533 self.cfg.Update(cluster, feedback_fn) 534 535 # update the known hosts file 536 ssh.WriteKnownHostsFile(self.cfg, pathutils.SSH_KNOWN_HOSTS_FILE) 537 node_list = self.cfg.GetOnlineNodeList() 538 try: 539 node_list.remove(master_params.uuid) 540 except ValueError: 541 pass 542 UploadHelper(self, node_list, pathutils.SSH_KNOWN_HOSTS_FILE) 543 finally: 544 master_params.ip = new_ip 545 result = self.rpc.call_node_activate_master_ip(master_params.uuid, 546 master_params, ems) 547 result.Warn("Could not re-enable the master role on the master," 548 " please restart manually", self.LogWarning) 549 550 return clustername
551
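
Exec above deactivates the master IP, updates the configuration and known-hosts file inside a try block, and re-activates the IP in the finally clause so the cluster stays reachable even if the update fails. A stripped-down sketch of that control flow; stop_ip, start_ip and apply_changes are hypothetical callables standing in for the RPC calls:

def rename_with_ip_cycle(stop_ip, start_ip, apply_changes):
  """Deactivate a service IP, apply changes, always reactivate the IP."""
  stop_ip()
  try:
    apply_changes()  # e.g. rewrite the configuration, push known_hosts
  finally:
    # Runs on success *and* on failure, so the cluster stays reachable.
    start_ip()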
552 553 -class LUClusterRepairDiskSizes(NoHooksLU):
554 """Verifies the cluster disks sizes. 555 556 """ 557 REQ_BGL = False 558
559 - def ExpandNames(self):
560 if self.op.instances: 561 (_, self.wanted_names) = GetWantedInstances(self, self.op.instances) 562 # Not getting the node allocation lock as only a specific set of 563 # instances (and their nodes) is going to be acquired 564 self.needed_locks = { 565 locking.LEVEL_NODE_RES: [], 566 locking.LEVEL_INSTANCE: self.wanted_names, 567 } 568 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE 569 else: 570 self.wanted_names = None 571 self.needed_locks = { 572 locking.LEVEL_NODE_RES: locking.ALL_SET, 573 locking.LEVEL_INSTANCE: locking.ALL_SET, 574 575 # This opcode acquires the node locks for all instances 576 locking.LEVEL_NODE_ALLOC: locking.ALL_SET, 577 } 578 579 self.share_locks = { 580 locking.LEVEL_NODE_RES: 1, 581 locking.LEVEL_INSTANCE: 0, 582 locking.LEVEL_NODE_ALLOC: 1, 583 }
584
585 - def DeclareLocks(self, level):
586 if level == locking.LEVEL_NODE_RES and self.wanted_names is not None: 587 self._LockInstancesNodes(primary_only=True, level=level)
588
589 - def CheckPrereq(self):
590 """Check prerequisites. 591 592 This only checks the optional instance list against the existing names. 593 594 """ 595 if self.wanted_names is None: 596 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE) 597 598 self.wanted_instances = \ 599 map(compat.snd, self.cfg.GetMultiInstanceInfoByName(self.wanted_names))
600
601 - def _EnsureChildSizes(self, disk):
602 """Ensure children of the disk have the needed disk size. 603 604 This is valid mainly for DRBD8 and fixes an issue where the 605 children have smaller disk size. 606 607 @param disk: an L{ganeti.objects.Disk} object 608 609 """ 610 if disk.dev_type == constants.DT_DRBD8: 611 assert disk.children, "Empty children for DRBD8?" 612 fchild = disk.children[0] 613 mismatch = fchild.size < disk.size 614 if mismatch: 615 self.LogInfo("Child disk has size %d, parent %d, fixing", 616 fchild.size, disk.size) 617 fchild.size = disk.size 618 619 # and we recurse on this child only, not on the metadev 620 return self._EnsureChildSizes(fchild) or mismatch 621 else: 622 return False
623
624 - def Exec(self, feedback_fn):
625 """Verify the size of cluster disks. 626 627 """ 628 # TODO: check child disks too 629 # TODO: check differences in size between primary/secondary nodes 630 per_node_disks = {} 631 for instance in self.wanted_instances: 632 pnode = instance.primary_node 633 if pnode not in per_node_disks: 634 per_node_disks[pnode] = [] 635 for idx, disk in enumerate(self.cfg.GetInstanceDisks(instance.uuid)): 636 per_node_disks[pnode].append((instance, idx, disk)) 637 638 assert not (frozenset(per_node_disks.keys()) - 639 frozenset(self.owned_locks(locking.LEVEL_NODE_RES))), \ 640 "Not owning correct locks" 641 assert not self.owned_locks(locking.LEVEL_NODE) 642 643 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, 644 per_node_disks.keys()) 645 646 changed = [] 647 for node_uuid, dskl in per_node_disks.items(): 648 if not dskl: 649 # no disks on the node 650 continue 651 652 newl = [([v[2].Copy()], v[0]) for v in dskl] 653 node_name = self.cfg.GetNodeName(node_uuid) 654 result = self.rpc.call_blockdev_getdimensions(node_uuid, newl) 655 if result.fail_msg: 656 self.LogWarning("Failure in blockdev_getdimensions call to node" 657 " %s, ignoring", node_name) 658 continue 659 if len(result.payload) != len(dskl): 660 logging.warning("Invalid result from node %s: len(dksl)=%d," 661 " result.payload=%s", node_name, len(dskl), 662 result.payload) 663 self.LogWarning("Invalid result from node %s, ignoring node results", 664 node_name) 665 continue 666 for ((instance, idx, disk), dimensions) in zip(dskl, result.payload): 667 if dimensions is None: 668 self.LogWarning("Disk %d of instance %s did not return size" 669 " information, ignoring", idx, instance.name) 670 continue 671 if not isinstance(dimensions, (tuple, list)): 672 self.LogWarning("Disk %d of instance %s did not return valid" 673 " dimension information, ignoring", idx, 674 instance.name) 675 continue 676 (size, spindles) = dimensions 677 if not isinstance(size, (int, long)): 678 self.LogWarning("Disk %d of instance %s did not return valid" 679 " size information, ignoring", idx, instance.name) 680 continue 681 size = size >> 20 682 if size != disk.size: 683 self.LogInfo("Disk %d of instance %s has mismatched size," 684 " correcting: recorded %d, actual %d", idx, 685 instance.name, disk.size, size) 686 disk.size = size 687 self.cfg.Update(disk, feedback_fn) 688 changed.append((instance.name, idx, "size", size)) 689 if es_flags[node_uuid]: 690 if spindles is None: 691 self.LogWarning("Disk %d of instance %s did not return valid" 692 " spindles information, ignoring", idx, 693 instance.name) 694 elif disk.spindles is None or disk.spindles != spindles: 695 self.LogInfo("Disk %d of instance %s has mismatched spindles," 696 " correcting: recorded %s, actual %s", 697 idx, instance.name, disk.spindles, spindles) 698 disk.spindles = spindles 699 self.cfg.Update(disk, feedback_fn) 700 changed.append((instance.name, idx, "spindles", disk.spindles)) 701 if self._EnsureChildSizes(disk): 702 self.cfg.Update(disk, feedback_fn) 703 changed.append((instance.name, idx, "size", disk.size)) 704 return changed
705
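
_EnsureChildSizes grows only the first child of a DRBD8 disk (the data device, not the metadata device) to the parent's size and recurses into it, while Exec above converts the byte payload to mebibytes with size >> 20 before comparing against the recorded size. A standalone sketch of the child-size recursion; FakeDisk is a minimal stand-in and not the real ganeti.objects.Disk:

DT_DRBD8 = "drbd"

class FakeDisk(object):
  def __init__(self, dev_type, size, children=None):
    self.dev_type = dev_type
    self.size = size
    self.children = children or []

def ensure_child_sizes(disk):
  """Return True if any child size had to be fixed."""
  if disk.dev_type != DT_DRBD8:
    return False
  data_child = disk.children[0]  # children[1] would be the metadata device
  mismatch = data_child.size < disk.size
  if mismatch:
    data_child.size = disk.size
  return ensure_child_sizes(data_child) or mismatch

lv = FakeDisk("plain", 1000)
drbd = FakeDisk(DT_DRBD8, 1024, [lv, FakeDisk("plain", 128)])
assert ensure_child_sizes(drbd)
assert lv.size == 1024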
706 707 -def _ValidateNetmask(cfg, netmask):
708 """Checks if a netmask is valid. 709 710 @type cfg: L{config.ConfigWriter} 711 @param cfg: cluster configuration 712 @type netmask: int 713 @param netmask: netmask to be verified 714 @raise errors.OpPrereqError: if the validation fails 715 716 """ 717 ip_family = cfg.GetPrimaryIPFamily() 718 try: 719 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family) 720 except errors.ProgrammerError: 721 raise errors.OpPrereqError("Invalid primary ip family: %s." % 722 ip_family, errors.ECODE_INVAL) 723 if not ipcls.ValidateNetmask(netmask): 724 raise errors.OpPrereqError("CIDR netmask (%s) not valid" % 725 (netmask), errors.ECODE_INVAL)
726
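
_ValidateNetmask accepts a CIDR prefix length only if it is legal for the cluster's primary IP family. A hedged illustration of the same bounds check using the standard library's ipaddress module instead of ganeti.netutils; netmask_is_valid is a made-up helper name:

import ipaddress

def netmask_is_valid(prefix_len, ip_family=4):
  """Return True if prefix_len is a legal CIDR prefix for the family."""
  base = "0.0.0.0" if ip_family == 4 else "::"
  try:
    ipaddress.ip_network("%s/%s" % (base, prefix_len))
    return True
  except ValueError:
    return False

assert netmask_is_valid(24)                # fine for IPv4
assert not netmask_is_valid(64)            # too long for IPv4
assert netmask_is_valid(64, ip_family=6)   # fine for IPv6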
727 728 -def CheckFileBasedStoragePathVsEnabledDiskTemplates( 729 logging_warn_fn, file_storage_dir, enabled_disk_templates, 730 file_disk_template):
731 """Checks whether the given file-based storage directory is acceptable. 732 733 Note: This function is public, because it is also used in bootstrap.py. 734 735 @type logging_warn_fn: function 736 @param logging_warn_fn: function which accepts a string and logs it 737 @type file_storage_dir: string 738 @param file_storage_dir: the directory to be used for file-based instances 739 @type enabled_disk_templates: list of string 740 @param enabled_disk_templates: the list of enabled disk templates 741 @type file_disk_template: string 742 @param file_disk_template: the file-based disk template for which the 743 path should be checked 744 745 """ 746 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes( 747 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER 748 )) 749 750 file_storage_enabled = file_disk_template in enabled_disk_templates 751 if file_storage_dir is not None: 752 if file_storage_dir == "": 753 if file_storage_enabled: 754 raise errors.OpPrereqError( 755 "Unsetting the '%s' storage directory while having '%s' storage" 756 " enabled is not permitted." % 757 (file_disk_template, file_disk_template)) 758 else: 759 if not file_storage_enabled: 760 logging_warn_fn( 761 "Specified a %s storage directory, although %s storage is not" 762 " enabled." % (file_disk_template, file_disk_template)) 763 else: 764 raise errors.ProgrammerError("Received %s storage dir with value" 765 " 'None'." % file_disk_template)
766
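
The function above encodes a small decision table: a directory of None is a programmer error, an empty string is only allowed while the corresponding disk template is disabled, and a non-empty directory merely triggers a warning when the template is disabled. The same logic condensed into a tiny standalone sketch; check_storage_dir is illustrative only and warn can be any callable taking a message:

def check_storage_dir(storage_dir, template_enabled, warn):
  """Validate a file-based storage directory against the template state."""
  if storage_dir is None:
    raise AssertionError("storage dir must not be None")
  if storage_dir == "":
    if template_enabled:
      raise ValueError("cannot unset the storage dir while the template"
                       " is enabled")
  elif not template_enabled:
    warn("storage dir given although the template is not enabled")

messages = []
check_storage_dir("/srv/ganeti/file-storage", True, messages.append)   # silent
check_storage_dir("/srv/ganeti/file-storage", False, messages.append)  # warns
assert len(messages) == 1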
767 768 -def CheckFileStoragePathVsEnabledDiskTemplates( 769 logging_warn_fn, file_storage_dir, enabled_disk_templates):
770 """Checks whether the given file storage directory is acceptable. 771 772 @see: C{CheckFileBasedStoragePathVsEnabledDiskTemplates} 773 774 """ 775 CheckFileBasedStoragePathVsEnabledDiskTemplates( 776 logging_warn_fn, file_storage_dir, enabled_disk_templates, 777 constants.DT_FILE)
778
779 780 -def CheckSharedFileStoragePathVsEnabledDiskTemplates( 781 logging_warn_fn, file_storage_dir, enabled_disk_templates):
782 """Checks whether the given shared file storage directory is acceptable. 783 784 @see: C{CheckFileBasedStoragePathVsEnabledDiskTemplates} 785 786 """ 787 CheckFileBasedStoragePathVsEnabledDiskTemplates( 788 logging_warn_fn, file_storage_dir, enabled_disk_templates, 789 constants.DT_SHARED_FILE)
790
791 792 -def CheckGlusterStoragePathVsEnabledDiskTemplates( 793 logging_warn_fn, file_storage_dir, enabled_disk_templates):
794 """Checks whether the given gluster storage directory is acceptable. 795 796 @see: C{CheckFileBasedStoragePathVsEnabledDiskTemplates} 797 798 """ 799 CheckFileBasedStoragePathVsEnabledDiskTemplates( 800 logging_warn_fn, file_storage_dir, enabled_disk_templates, 801 constants.DT_GLUSTER)
802
803 804 -def CheckCompressionTools(tools):
805 """Check whether the provided compression tools look like executables. 806 807 @type tools: list of string 808 @param tools: The tools provided as opcode input 809 810 """ 811 regex = re.compile('^[-_a-zA-Z0-9]+$') 812 illegal_tools = [t for t in tools if not regex.match(t)] 813 814 if illegal_tools: 815 raise errors.OpPrereqError( 816 "The tools '%s' contain illegal characters: only alphanumeric values," 817 " dashes, and underscores are allowed" % ", ".join(illegal_tools) 818 ) 819 820 if constants.IEC_GZIP not in tools: 821 raise errors.OpPrereqError("For compatibility reasons, the %s utility must" 822 " be present among the compression tools" % 823 constants.IEC_GZIP) 824 825 if constants.IEC_NONE in tools: 826 raise errors.OpPrereqError("%s is a reserved value used for no compression," 827 " and cannot be used as the name of a tool" % 828 constants.IEC_NONE)
829
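
CheckCompressionTools enforces three rules: tool names must match a conservative character whitelist, gzip must remain available for compatibility, and the reserved name used for "no compression" may not name a tool. A compact standalone version of the same checks, using plain strings where the original uses constants.IEC_GZIP and constants.IEC_NONE:

import re

_TOOL_RE = re.compile(r"^[-_a-zA-Z0-9]+$")

def check_compression_tools(tools):
  bad = [t for t in tools if not _TOOL_RE.match(t)]
  if bad:
    raise ValueError("illegal tool names: %s" % ", ".join(bad))
  if "gzip" not in tools:
    raise ValueError("gzip must be present for compatibility")
  if "none" in tools:
    raise ValueError("'none' is reserved for no compression")

check_compression_tools(["gzip", "lzop"])  # passes
try:
  check_compression_tools(["gzip", "bad tool!"])  # space and '!' are rejected
except ValueError:
  pass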
830 831 -class LUClusterSetParams(LogicalUnit):
832 """Change the parameters of the cluster. 833 834 """ 835 HPATH = "cluster-modify" 836 HTYPE = constants.HTYPE_CLUSTER 837 REQ_BGL = False 838
839 - def CheckArguments(self):
840 """Check parameters 841 842 """ 843 if self.op.uid_pool: 844 uidpool.CheckUidPool(self.op.uid_pool) 845 846 if self.op.add_uids: 847 uidpool.CheckUidPool(self.op.add_uids) 848 849 if self.op.remove_uids: 850 uidpool.CheckUidPool(self.op.remove_uids) 851 852 if self.op.mac_prefix: 853 self.op.mac_prefix = \ 854 utils.NormalizeAndValidateThreeOctetMacPrefix(self.op.mac_prefix) 855 856 if self.op.master_netmask is not None: 857 _ValidateNetmask(self.cfg, self.op.master_netmask) 858 859 if self.op.diskparams: 860 for dt_params in self.op.diskparams.values(): 861 utils.ForceDictType(dt_params, constants.DISK_DT_TYPES) 862 try: 863 utils.VerifyDictOptions(self.op.diskparams, constants.DISK_DT_DEFAULTS) 864 CheckDiskAccessModeValidity(self.op.diskparams) 865 except errors.OpPrereqError as err: 866 raise errors.OpPrereqError("While verifying diskparams options: %s" % err, 867 errors.ECODE_INVAL) 868 869 if self.op.install_image is not None: 870 CheckImageValidity(self.op.install_image, 871 "Install image must be an absolute path or a URL")
872
873 - def ExpandNames(self):
874 # FIXME: in the future maybe other cluster params won't require checking on 875 # all nodes to be modified. 876 # FIXME: This opcode changes cluster-wide settings. Is acquiring all 877 # resource locks the right thing, shouldn't it be the BGL instead? 878 self.needed_locks = { 879 locking.LEVEL_NODE: locking.ALL_SET, 880 locking.LEVEL_INSTANCE: locking.ALL_SET, 881 locking.LEVEL_NODEGROUP: locking.ALL_SET, 882 locking.LEVEL_NODE_ALLOC: locking.ALL_SET, 883 } 884 self.share_locks = ShareAll()
885
886 - def BuildHooksEnv(self):
887 """Build hooks env. 888 889 """ 890 return { 891 "OP_TARGET": self.cfg.GetClusterName(), 892 "NEW_VG_NAME": self.op.vg_name, 893 }
894
895 - def BuildHooksNodes(self):
896 """Build hooks nodes. 897 898 """ 899 mn = self.cfg.GetMasterNode() 900 return ([mn], [mn])
901
902 - def _CheckVgName(self, node_uuids, enabled_disk_templates, 903 new_enabled_disk_templates):
904 """Check the consistency of the vg name on all nodes and, in case it gets 905 unset, whether there are instances still using it. 906 907 """ 908 lvm_is_enabled = utils.IsLvmEnabled(enabled_disk_templates) 909 lvm_gets_enabled = utils.LvmGetsEnabled(enabled_disk_templates, 910 new_enabled_disk_templates) 911 current_vg_name = self.cfg.GetVGName() 912 913 if self.op.vg_name == '': 914 if lvm_is_enabled: 915 raise errors.OpPrereqError("Cannot unset volume group if lvm-based" 916 " disk templates are or get enabled.") 917 918 if self.op.vg_name is None: 919 if current_vg_name is None and lvm_is_enabled: 920 raise errors.OpPrereqError("Please specify a volume group when" 921 " enabling lvm-based disk-templates.") 922 923 if self.op.vg_name is not None and not self.op.vg_name: 924 if self.cfg.HasAnyDiskOfType(constants.DT_PLAIN): 925 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based" 926 " instances exist", errors.ECODE_INVAL) 927 928 if (self.op.vg_name is not None and lvm_is_enabled) or \ 929 (self.cfg.GetVGName() is not None and lvm_gets_enabled): 930 self._CheckVgNameOnNodes(node_uuids)
931
932 - def _CheckVgNameOnNodes(self, node_uuids):
933 """Check the status of the volume group on each node. 934 935 """ 936 vglist = self.rpc.call_vg_list(node_uuids) 937 for node_uuid in node_uuids: 938 msg = vglist[node_uuid].fail_msg 939 if msg: 940 # ignoring down node 941 self.LogWarning("Error while gathering data on node %s" 942 " (ignoring node): %s", 943 self.cfg.GetNodeName(node_uuid), msg) 944 continue 945 vgstatus = utils.CheckVolumeGroupSize(vglist[node_uuid].payload, 946 self.op.vg_name, 947 constants.MIN_VG_SIZE) 948 if vgstatus: 949 raise errors.OpPrereqError("Error on node '%s': %s" % 950 (self.cfg.GetNodeName(node_uuid), vgstatus), 951 errors.ECODE_ENVIRON)
952 953 @staticmethod
954 - def _GetDiskTemplateSetsInner(op_enabled_disk_templates, 955 old_enabled_disk_templates):
956 """Computes three sets of disk templates. 957 958 @see: C{_GetDiskTemplateSets} for more details. 959 960 """ 961 enabled_disk_templates = None 962 new_enabled_disk_templates = [] 963 disabled_disk_templates = [] 964 if op_enabled_disk_templates: 965 enabled_disk_templates = op_enabled_disk_templates 966 new_enabled_disk_templates = \ 967 list(set(enabled_disk_templates) 968 - set(old_enabled_disk_templates)) 969 disabled_disk_templates = \ 970 list(set(old_enabled_disk_templates) 971 - set(enabled_disk_templates)) 972 else: 973 enabled_disk_templates = old_enabled_disk_templates 974 return (enabled_disk_templates, new_enabled_disk_templates, 975 disabled_disk_templates)
976
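
_GetDiskTemplateSetsInner is plain set arithmetic: from the templates requested in the opcode and those currently enabled it derives the future enabled set, the newly enabled templates and the newly disabled ones. A worked example with illustrative template names; disk_template_sets is a simplified stand-in, not the method itself:

def disk_template_sets(op_enabled, old_enabled):
  if not op_enabled:
    return (old_enabled, [], [])
  newly_enabled = sorted(set(op_enabled) - set(old_enabled))
  newly_disabled = sorted(set(old_enabled) - set(op_enabled))
  return (op_enabled, newly_enabled, newly_disabled)

enabled, added, removed = disk_template_sets(["plain", "drbd"],
                                             ["plain", "file"])
assert enabled == ["plain", "drbd"]
assert added == ["drbd"]
assert removed == ["file"]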
977 - def _GetDiskTemplateSets(self, cluster):
978 """Computes three sets of disk templates. 979 980 The three sets are: 981 - disk templates that will be enabled after this operation (no matter if 982 they were enabled before or not) 983 - disk templates that get enabled by this operation (thus haven't been 984 enabled before.) 985 - disk templates that get disabled by this operation 986 987 """ 988 return self._GetDiskTemplateSetsInner(self.op.enabled_disk_templates, 989 cluster.enabled_disk_templates)
990
991 - def _CheckIpolicy(self, cluster, enabled_disk_templates):
992 """Checks the ipolicy. 993 994 @type cluster: C{objects.Cluster} 995 @param cluster: the cluster's configuration 996 @type enabled_disk_templates: list of string 997 @param enabled_disk_templates: list of (possibly newly) enabled disk 998 templates 999 1000 """ 1001 # FIXME: write unit tests for this 1002 if self.op.ipolicy: 1003 self.new_ipolicy = GetUpdatedIPolicy(cluster.ipolicy, self.op.ipolicy, 1004 group_policy=False) 1005 1006 CheckIpolicyVsDiskTemplates(self.new_ipolicy, 1007 enabled_disk_templates) 1008 1009 all_instances = self.cfg.GetAllInstancesInfo().values() 1010 violations = set() 1011 for group in self.cfg.GetAllNodeGroupsInfo().values(): 1012 instances = frozenset( 1013 [inst for inst in all_instances 1014 if compat.any(nuuid in group.members 1015 for nuuid in self.cfg.GetInstanceNodes(inst.uuid))]) 1016 new_ipolicy = objects.FillIPolicy(self.new_ipolicy, group.ipolicy) 1017 ipol = masterd.instance.CalculateGroupIPolicy(cluster, group) 1018 new = ComputeNewInstanceViolations(ipol, new_ipolicy, instances, 1019 self.cfg) 1020 if new: 1021 violations.update(new) 1022 1023 if violations: 1024 self.LogWarning("After the ipolicy change the following instances" 1025 " violate them: %s", 1026 utils.CommaJoin(utils.NiceSort(violations))) 1027 else: 1028 CheckIpolicyVsDiskTemplates(cluster.ipolicy, 1029 enabled_disk_templates)
1030
1031 - def _CheckDrbdHelperOnNodes(self, drbd_helper, node_uuids):
1032 """Checks whether the set DRBD helper actually exists on the nodes. 1033 1034 @type drbd_helper: string 1035 @param drbd_helper: path of the drbd usermode helper binary 1036 @type node_uuids: list of strings 1037 @param node_uuids: list of node UUIDs to check for the helper 1038 1039 """ 1040 # checks given drbd helper on all nodes 1041 helpers = self.rpc.call_drbd_helper(node_uuids) 1042 for (_, ninfo) in self.cfg.GetMultiNodeInfo(node_uuids): 1043 if ninfo.offline: 1044 self.LogInfo("Not checking drbd helper on offline node %s", 1045 ninfo.name) 1046 continue 1047 msg = helpers[ninfo.uuid].fail_msg 1048 if msg: 1049 raise errors.OpPrereqError("Error checking drbd helper on node" 1050 " '%s': %s" % (ninfo.name, msg), 1051 errors.ECODE_ENVIRON) 1052 node_helper = helpers[ninfo.uuid].payload 1053 if node_helper != drbd_helper: 1054 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" % 1055 (ninfo.name, node_helper), 1056 errors.ECODE_ENVIRON)
1057
1058 - def _CheckDrbdHelper(self, node_uuids, drbd_enabled, drbd_gets_enabled):
1059 """Check the DRBD usermode helper. 1060 1061 @type node_uuids: list of strings 1062 @param node_uuids: a list of nodes' UUIDs 1063 @type drbd_enabled: boolean 1064 @param drbd_enabled: whether DRBD will be enabled after this operation 1065 (no matter if it was disabled before or not) 1066 @type drbd_gets_enabled: boolean 1067 @param drbd_gets_enabled: true if DRBD was disabled before this 1068 operation, but will be enabled afterwards 1069 1070 """ 1071 if self.op.drbd_helper == '': 1072 if drbd_enabled: 1073 raise errors.OpPrereqError("Cannot disable drbd helper while" 1074 " DRBD is enabled.") 1075 if self.cfg.HasAnyDiskOfType(constants.DT_DRBD8): 1076 raise errors.OpPrereqError("Cannot disable drbd helper while" 1077 " drbd-based instances exist", 1078 errors.ECODE_INVAL) 1079 1080 else: 1081 if self.op.drbd_helper is not None and drbd_enabled: 1082 self._CheckDrbdHelperOnNodes(self.op.drbd_helper, node_uuids) 1083 else: 1084 if drbd_gets_enabled: 1085 current_drbd_helper = self.cfg.GetClusterInfo().drbd_usermode_helper 1086 if current_drbd_helper is not None: 1087 self._CheckDrbdHelperOnNodes(current_drbd_helper, node_uuids) 1088 else: 1089 raise errors.OpPrereqError("Cannot enable DRBD without a" 1090 " DRBD usermode helper set.")
1091
1092 - def _CheckInstancesOfDisabledDiskTemplates( 1093 self, disabled_disk_templates):
1094 """Check whether we try to disable a disk template that is in use. 1095 1096 @type disabled_disk_templates: list of string 1097 @param disabled_disk_templates: list of disk templates that are going to 1098 be disabled by this operation 1099 1100 """ 1101 for disk_template in disabled_disk_templates: 1102 if self.cfg.HasAnyDiskOfType(disk_template): 1103 raise errors.OpPrereqError( 1104 "Cannot disable disk template '%s', because there is at least one" 1105 " instance using it." % disk_template)
1106 1107 @staticmethod
1108 - def _CheckInstanceCommunicationNetwork(network, warning_fn):
1109 """Check whether an existing network is configured for instance 1110 communication. 1111 1112 Checks whether an existing network is configured with the 1113 parameters that are advisable for instance communication, and 1114 otherwise issue security warnings. 1115 1116 @type network: L{ganeti.objects.Network} 1117 @param network: L{ganeti.objects.Network} object whose 1118 configuration is being checked 1119 @type warning_fn: function 1120 @param warning_fn: function used to print warnings 1121 @rtype: None 1122 @return: None 1123 1124 """ 1125 def _MaybeWarn(err, val, default): 1126 if val != default: 1127 warning_fn("Supplied instance communication network '%s' %s '%s'," 1128 " this might pose a security risk (default is '%s').", 1129 network.name, err, val, default)
1130 1131 if network.network is None: 1132 raise errors.OpPrereqError("Supplied instance communication network '%s'" 1133 " must have an IPv4 network address." % 1134 network.name, errors.ECODE_INVAL) 1135 1136 _MaybeWarn("has an IPv4 gateway", network.gateway, None) 1137 _MaybeWarn("has a non-standard IPv4 network address", network.network, 1138 constants.INSTANCE_COMMUNICATION_NETWORK4) 1139 _MaybeWarn("has an IPv6 gateway", network.gateway6, None) 1140 _MaybeWarn("has a non-standard IPv6 network address", network.network6, 1141 constants.INSTANCE_COMMUNICATION_NETWORK6) 1142 _MaybeWarn("has a non-standard MAC prefix", network.mac_prefix, 1143 constants.INSTANCE_COMMUNICATION_MAC_PREFIX)
1144
1145 - def CheckPrereq(self):
1146 """Check prerequisites. 1147 1148 This checks whether the given params don't conflict and 1149 if the given volume group is valid. 1150 1151 """ 1152 node_uuids = self.owned_locks(locking.LEVEL_NODE) 1153 self.cluster = cluster = self.cfg.GetClusterInfo() 1154 1155 vm_capable_node_uuids = [node.uuid 1156 for node in self.cfg.GetAllNodesInfo().values() 1157 if node.uuid in node_uuids and node.vm_capable] 1158 1159 (enabled_disk_templates, new_enabled_disk_templates, 1160 disabled_disk_templates) = self._GetDiskTemplateSets(cluster) 1161 self._CheckInstancesOfDisabledDiskTemplates(disabled_disk_templates) 1162 1163 self._CheckVgName(vm_capable_node_uuids, enabled_disk_templates, 1164 new_enabled_disk_templates) 1165 1166 if self.op.file_storage_dir is not None: 1167 CheckFileStoragePathVsEnabledDiskTemplates( 1168 self.LogWarning, self.op.file_storage_dir, enabled_disk_templates) 1169 1170 if self.op.shared_file_storage_dir is not None: 1171 CheckSharedFileStoragePathVsEnabledDiskTemplates( 1172 self.LogWarning, self.op.shared_file_storage_dir, 1173 enabled_disk_templates) 1174 1175 drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates 1176 drbd_gets_enabled = constants.DT_DRBD8 in new_enabled_disk_templates 1177 self._CheckDrbdHelper(vm_capable_node_uuids, 1178 drbd_enabled, drbd_gets_enabled) 1179 1180 # validate params changes 1181 if self.op.beparams: 1182 objects.UpgradeBeParams(self.op.beparams) 1183 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES) 1184 self.new_beparams = cluster.SimpleFillBE(self.op.beparams) 1185 1186 if self.op.ndparams: 1187 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES) 1188 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams) 1189 1190 # TODO: we need a more general way to handle resetting 1191 # cluster-level parameters to default values 1192 if self.new_ndparams["oob_program"] == "": 1193 self.new_ndparams["oob_program"] = \ 1194 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM] 1195 1196 if self.op.hv_state: 1197 new_hv_state = MergeAndVerifyHvState(self.op.hv_state, 1198 self.cluster.hv_state_static) 1199 self.new_hv_state = dict((hv, cluster.SimpleFillHvState(values)) 1200 for hv, values in new_hv_state.items()) 1201 1202 if self.op.disk_state: 1203 new_disk_state = MergeAndVerifyDiskState(self.op.disk_state, 1204 self.cluster.disk_state_static) 1205 self.new_disk_state = \ 1206 dict((storage, dict((name, cluster.SimpleFillDiskState(values)) 1207 for name, values in svalues.items())) 1208 for storage, svalues in new_disk_state.items()) 1209 1210 self._CheckIpolicy(cluster, enabled_disk_templates) 1211 1212 if self.op.nicparams: 1213 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES) 1214 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams) 1215 objects.NIC.CheckParameterSyntax(self.new_nicparams) 1216 nic_errors = [] 1217 1218 # check all instances for consistency 1219 for instance in self.cfg.GetAllInstancesInfo().values(): 1220 for nic_idx, nic in enumerate(instance.nics): 1221 params_copy = copy.deepcopy(nic.nicparams) 1222 params_filled = objects.FillDict(self.new_nicparams, params_copy) 1223 1224 # check parameter syntax 1225 try: 1226 objects.NIC.CheckParameterSyntax(params_filled) 1227 except errors.ConfigurationError, err: 1228 nic_errors.append("Instance %s, nic/%d: %s" % 1229 (instance.name, nic_idx, err)) 1230 1231 # if we're moving instances to routed, check that they have an ip 1232 target_mode = params_filled[constants.NIC_MODE] 1233 if target_mode == 
constants.NIC_MODE_ROUTED and not nic.ip: 1234 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip" 1235 " address" % (instance.name, nic_idx)) 1236 if nic_errors: 1237 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" % 1238 "\n".join(nic_errors), errors.ECODE_INVAL) 1239 1240 # hypervisor list/parameters 1241 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {}) 1242 if self.op.hvparams: 1243 for hv_name, hv_dict in self.op.hvparams.items(): 1244 if hv_name not in self.new_hvparams: 1245 self.new_hvparams[hv_name] = hv_dict 1246 else: 1247 self.new_hvparams[hv_name].update(hv_dict) 1248 1249 # disk template parameters 1250 self.new_diskparams = objects.FillDict(cluster.diskparams, {}) 1251 if self.op.diskparams: 1252 for dt_name, dt_params in self.op.diskparams.items(): 1253 if dt_name not in self.new_diskparams: 1254 self.new_diskparams[dt_name] = dt_params 1255 else: 1256 self.new_diskparams[dt_name].update(dt_params) 1257 CheckDiskAccessModeConsistency(self.op.diskparams, self.cfg) 1258 1259 # os hypervisor parameters 1260 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {}) 1261 if self.op.os_hvp: 1262 for os_name, hvs in self.op.os_hvp.items(): 1263 if os_name not in self.new_os_hvp: 1264 self.new_os_hvp[os_name] = hvs 1265 else: 1266 for hv_name, hv_dict in hvs.items(): 1267 if hv_dict is None: 1268 # Delete if it exists 1269 self.new_os_hvp[os_name].pop(hv_name, None) 1270 elif hv_name not in self.new_os_hvp[os_name]: 1271 self.new_os_hvp[os_name][hv_name] = hv_dict 1272 else: 1273 self.new_os_hvp[os_name][hv_name].update(hv_dict) 1274 1275 # os parameters 1276 self._BuildOSParams(cluster) 1277 1278 # changes to the hypervisor list 1279 if self.op.enabled_hypervisors is not None: 1280 for hv in self.op.enabled_hypervisors: 1281 # if the hypervisor doesn't already exist in the cluster 1282 # hvparams, we initialize it to empty, and then (in both 1283 # cases) we make sure to fill the defaults, as we might not 1284 # have a complete defaults list if the hypervisor wasn't 1285 # enabled before 1286 if hv not in new_hvp: 1287 new_hvp[hv] = {} 1288 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv]) 1289 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES) 1290 1291 if self.op.hvparams or self.op.enabled_hypervisors is not None: 1292 # either the enabled list has changed, or the parameters have, validate 1293 for hv_name, hv_params in self.new_hvparams.items(): 1294 if ((self.op.hvparams and hv_name in self.op.hvparams) or 1295 (self.op.enabled_hypervisors and 1296 hv_name in self.op.enabled_hypervisors)): 1297 # either this is a new hypervisor, or its parameters have changed 1298 hv_class = hypervisor.GetHypervisorClass(hv_name) 1299 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) 1300 hv_class.CheckParameterSyntax(hv_params) 1301 CheckHVParams(self, node_uuids, hv_name, hv_params) 1302 1303 self._CheckDiskTemplateConsistency() 1304 1305 if self.op.os_hvp: 1306 # no need to check any newly-enabled hypervisors, since the 1307 # defaults have already been checked in the above code-block 1308 for os_name, os_hvp in self.new_os_hvp.items(): 1309 for hv_name, hv_params in os_hvp.items(): 1310 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) 1311 # we need to fill in the new os_hvp on top of the actual hv_p 1312 cluster_defaults = self.new_hvparams.get(hv_name, {}) 1313 new_osp = objects.FillDict(cluster_defaults, hv_params) 1314 hv_class = hypervisor.GetHypervisorClass(hv_name) 1315 
hv_class.CheckParameterSyntax(new_osp) 1316 CheckHVParams(self, node_uuids, hv_name, new_osp) 1317 1318 if self.op.default_iallocator: 1319 alloc_script = utils.FindFile(self.op.default_iallocator, 1320 constants.IALLOCATOR_SEARCH_PATH, 1321 os.path.isfile) 1322 if alloc_script is None: 1323 raise errors.OpPrereqError("Invalid default iallocator script '%s'" 1324 " specified" % self.op.default_iallocator, 1325 errors.ECODE_INVAL) 1326 1327 if self.op.instance_communication_network: 1328 network_name = self.op.instance_communication_network 1329 1330 try: 1331 network_uuid = self.cfg.LookupNetwork(network_name) 1332 except errors.OpPrereqError: 1333 network_uuid = None 1334 1335 if network_uuid is not None: 1336 network = self.cfg.GetNetwork(network_uuid) 1337 self._CheckInstanceCommunicationNetwork(network, self.LogWarning) 1338 1339 if self.op.compression_tools: 1340 CheckCompressionTools(self.op.compression_tools)
1341
1342 - def _BuildOSParams(self, cluster):
1343 "Calculate the new OS parameters for this operation." 1344 1345 def _GetNewParams(source, new_params): 1346 "Wrapper around GetUpdatedParams." 1347 if new_params is None: 1348 return source 1349 result = objects.FillDict(source, {}) # deep copy of source 1350 for os_name in new_params: 1351 result[os_name] = GetUpdatedParams(result.get(os_name, {}), 1352 new_params[os_name], 1353 use_none=True) 1354 if not result[os_name]: 1355 del result[os_name] # we removed all parameters 1356 return result
1357 1358 self.new_osp = _GetNewParams(cluster.osparams, 1359 self.op.osparams) 1360 self.new_osp_private = _GetNewParams(cluster.osparams_private_cluster, 1361 self.op.osparams_private_cluster) 1362 1363 # Remove os validity check 1364 changed_oses = (set(self.new_osp.keys()) | set(self.new_osp_private.keys())) 1365 for os_name in changed_oses: 1366 os_params = cluster.SimpleFillOS( 1367 os_name, 1368 self.new_osp.get(os_name, {}), 1369 os_params_private=self.new_osp_private.get(os_name, {}) 1370 ) 1371 # check the parameter validity (remote check) 1372 CheckOSParams(self, False, [self.cfg.GetMasterNode()], 1373 os_name, os_params, False) 1374
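
_GetNewParams layers the per-OS updates from the opcode onto the existing osparams and drops an OS entry once it has no parameters left; GetUpdatedParams with use_none=True performs the per-dict merge. A simplified sketch of that shape, assuming (as the use_none flag suggests) that a value of None removes a key; merge_params and new_os_params are stand-ins, not the Ganeti helpers:

def merge_params(old, updates):
  """Simplified merge: a value of None deletes the key, others overwrite."""
  result = dict(old)
  for key, value in updates.items():
    if value is None:
      result.pop(key, None)
    else:
      result[key] = value
  return result

def new_os_params(current, op_updates):
  result = dict(current)
  for os_name, updates in (op_updates or {}).items():
    result[os_name] = merge_params(result.get(os_name, {}), updates)
    if not result[os_name]:
      del result[os_name]  # every parameter for this OS was removed
  return result

current = {"debian": {"dhcp": "yes"}}
assert new_os_params(current, {"debian": {"dhcp": None}}) == {}
assert new_os_params(current, {"ubuntu": {"arch": "amd64"}}) == \
    {"debian": {"dhcp": "yes"}, "ubuntu": {"arch": "amd64"}}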
1375 - def _CheckDiskTemplateConsistency(self):
1376 """Check whether the disk templates that are going to be disabled 1377 are still in use by some instances. 1378 1379 """ 1380 if self.op.enabled_disk_templates: 1381 cluster = self.cfg.GetClusterInfo() 1382 instances = self.cfg.GetAllInstancesInfo() 1383 1384 disk_templates_to_remove = set(cluster.enabled_disk_templates) \ 1385 - set(self.op.enabled_disk_templates) 1386 for instance in instances.itervalues(): 1387 if instance.disk_template in disk_templates_to_remove: 1388 raise errors.OpPrereqError("Cannot disable disk template '%s'," 1389 " because instance '%s' is using it." % 1390 (instance.disk_template, instance.name))
1391
1392 - def _SetVgName(self, feedback_fn):
1393 """Determines and sets the new volume group name. 1394 1395 """ 1396 if self.op.vg_name is not None: 1397 new_volume = self.op.vg_name 1398 if not new_volume: 1399 new_volume = None 1400 if new_volume != self.cfg.GetVGName(): 1401 self.cfg.SetVGName(new_volume) 1402 else: 1403 feedback_fn("Cluster LVM configuration already in desired" 1404 " state, not changing")
1405
1406 - def _SetFileStorageDir(self, feedback_fn):
1407 """Set the file storage directory. 1408 1409 """ 1410 if self.op.file_storage_dir is not None: 1411 if self.cluster.file_storage_dir == self.op.file_storage_dir: 1412 feedback_fn("Global file storage dir already set to value '%s'" 1413 % self.cluster.file_storage_dir) 1414 else: 1415 self.cluster.file_storage_dir = self.op.file_storage_dir
1416
1417 - def _SetSharedFileStorageDir(self, feedback_fn):
1418 """Set the shared file storage directory. 1419 1420 """ 1421 if self.op.shared_file_storage_dir is not None: 1422 if self.cluster.shared_file_storage_dir == \ 1423 self.op.shared_file_storage_dir: 1424 feedback_fn("Global shared file storage dir already set to value '%s'" 1425 % self.cluster.shared_file_storage_dir) 1426 else: 1427 self.cluster.shared_file_storage_dir = self.op.shared_file_storage_dir
1428
1429 - def _SetDrbdHelper(self, feedback_fn):
1430 """Set the DRBD usermode helper. 1431 1432 """ 1433 if self.op.drbd_helper is not None: 1434 if not constants.DT_DRBD8 in self.cluster.enabled_disk_templates: 1435 feedback_fn("Note that you specified a drbd user helper, but did not" 1436 " enable the drbd disk template.") 1437 new_helper = self.op.drbd_helper 1438 if not new_helper: 1439 new_helper = None 1440 if new_helper != self.cfg.GetDRBDHelper(): 1441 self.cfg.SetDRBDHelper(new_helper) 1442 else: 1443 feedback_fn("Cluster DRBD helper already in desired state," 1444 " not changing")
1445 1446 @staticmethod
1447 - def _EnsureInstanceCommunicationNetwork(cfg, network_name):
1448 """Ensure that the instance communication network exists and is 1449 connected to all groups. 1450 1451 The instance communication network given by L{network_name} is 1452 created, if necessary, via the opcode 'OpNetworkAdd'. Also, the 1453 instance communication network is connected to all existing node 1454 groups, if necessary, via the opcode 'OpNetworkConnect'. 1455 1456 @type cfg: L{config.ConfigWriter} 1457 @param cfg: cluster configuration 1458 1459 @type network_name: string 1460 @param network_name: instance communication network name 1461 1462 @rtype: L{ganeti.cmdlib.ResultWithJobs} or L{None} 1463 @return: L{ganeti.cmdlib.ResultWithJobs} if the instance 1464 communication network needs to be created or connected 1465 to a group, otherwise L{None} 1466 1467 """ 1468 jobs = [] 1469 1470 try: 1471 network_uuid = cfg.LookupNetwork(network_name) 1472 network_exists = True 1473 except errors.OpPrereqError: 1474 network_exists = False 1475 1476 if not network_exists: 1477 jobs.append(AddInstanceCommunicationNetworkOp(network_name)) 1478 1479 for group_uuid in cfg.GetNodeGroupList(): 1480 group = cfg.GetNodeGroup(group_uuid) 1481 1482 if network_exists: 1483 network_connected = network_uuid in group.networks 1484 else: 1485 # The network was created asynchronously by the previous 1486 # opcode and, therefore, we don't have access to its 1487 # network_uuid. As a result, we assume that the network is 1488 # not connected to any group yet. 1489 network_connected = False 1490 1491 if not network_connected: 1492 op = ConnectInstanceCommunicationNetworkOp(group_uuid, network_name) 1493 jobs.append(op) 1494 1495 if jobs: 1496 return ResultWithJobs([jobs]) 1497 else: 1498 return None
1499 1500 @staticmethod
1501 - def _ModifyInstanceCommunicationNetwork(cfg, network_name, feedback_fn):
1502 """Update the instance communication network stored in the cluster 1503 configuration. 1504 1505 Compares the user-supplied instance communication network against 1506 the one stored in the Ganeti cluster configuration. If there is a 1507 change, the instance communication network may be created 1508 and connected to all groups (see 1509 L{LUClusterSetParams._EnsureInstanceCommunicationNetwork}). 1510 1511 @type cfg: L{config.ConfigWriter} 1512 @param cfg: cluster configuration 1513 1514 @type network_name: string 1515 @param network_name: instance communication network name 1516 1517 @type feedback_fn: function 1518 @param feedback_fn: see L{ganeti.cmdlib.base.LogicalUnit} 1519 1520 @rtype: L{LUClusterSetParams._EnsureInstanceCommunicationNetwork} or L{None} 1521 @return: see L{LUClusterSetParams._EnsureInstanceCommunicationNetwork} 1522 1523 """ 1524 config_network_name = cfg.GetInstanceCommunicationNetwork() 1525 1526 if network_name == config_network_name: 1527 feedback_fn("Instance communication network already is '%s', nothing to" 1528 " do." % network_name) 1529 else: 1530 try: 1531 cfg.LookupNetwork(config_network_name) 1532 feedback_fn("Previous instance communication network '%s'" 1533 " should be removed manually." % config_network_name) 1534 except errors.OpPrereqError: 1535 pass 1536 1537 if network_name: 1538 feedback_fn("Changing instance communication network to '%s', only new" 1539 " instances will be affected." 1540 % network_name) 1541 else: 1542 feedback_fn("Disabling instance communication network, only new" 1543 " instances will be affected.") 1544 1545 cfg.SetInstanceCommunicationNetwork(network_name) 1546 1547 if network_name: 1548 return LUClusterSetParams._EnsureInstanceCommunicationNetwork( 1549 cfg, 1550 network_name) 1551 else: 1552 return None
1553
1554 - def Exec(self, feedback_fn):
1555 """Change the parameters of the cluster. 1556 1557 """ 1558 # re-read the fresh configuration 1559 self.cluster = self.cfg.GetClusterInfo() 1560 if self.op.enabled_disk_templates: 1561 self.cluster.enabled_disk_templates = \ 1562 list(self.op.enabled_disk_templates) 1563 # save the changes 1564 self.cfg.Update(self.cluster, feedback_fn) 1565 1566 self._SetVgName(feedback_fn) 1567 1568 self.cluster = self.cfg.GetClusterInfo() 1569 self._SetFileStorageDir(feedback_fn) 1570 self._SetSharedFileStorageDir(feedback_fn) 1571 self.cfg.Update(self.cluster, feedback_fn) 1572 self._SetDrbdHelper(feedback_fn) 1573 1574 # re-read the fresh configuration again 1575 self.cluster = self.cfg.GetClusterInfo() 1576 1577 ensure_kvmd = False 1578 1579 if self.op.hvparams: 1580 self.cluster.hvparams = self.new_hvparams 1581 if self.op.os_hvp: 1582 self.cluster.os_hvp = self.new_os_hvp 1583 if self.op.enabled_hypervisors is not None: 1584 self.cluster.hvparams = self.new_hvparams 1585 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors 1586 ensure_kvmd = True 1587 if self.op.beparams: 1588 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams 1589 if self.op.nicparams: 1590 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams 1591 if self.op.ipolicy: 1592 self.cluster.ipolicy = self.new_ipolicy 1593 if self.op.osparams: 1594 self.cluster.osparams = self.new_osp 1595 if self.op.osparams_private_cluster: 1596 self.cluster.osparams_private_cluster = self.new_osp_private 1597 if self.op.ndparams: 1598 self.cluster.ndparams = self.new_ndparams 1599 if self.op.diskparams: 1600 self.cluster.diskparams = self.new_diskparams 1601 if self.op.hv_state: 1602 self.cluster.hv_state_static = self.new_hv_state 1603 if self.op.disk_state: 1604 self.cluster.disk_state_static = self.new_disk_state 1605 1606 if self.op.candidate_pool_size is not None: 1607 self.cluster.candidate_pool_size = self.op.candidate_pool_size 1608 # we need to update the pool size here, otherwise the save will fail 1609 AdjustCandidatePool(self, []) 1610 1611 if self.op.max_running_jobs is not None: 1612 self.cluster.max_running_jobs = self.op.max_running_jobs 1613 1614 if self.op.max_tracked_jobs is not None: 1615 self.cluster.max_tracked_jobs = self.op.max_tracked_jobs 1616 1617 if self.op.maintain_node_health is not None: 1618 if self.op.maintain_node_health and not constants.ENABLE_CONFD: 1619 feedback_fn("Note: CONFD was disabled at build time, node health" 1620 " maintenance is not useful (still enabling it)") 1621 self.cluster.maintain_node_health = self.op.maintain_node_health 1622 1623 if self.op.modify_etc_hosts is not None: 1624 self.cluster.modify_etc_hosts = self.op.modify_etc_hosts 1625 1626 if self.op.prealloc_wipe_disks is not None: 1627 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks 1628 1629 if self.op.add_uids is not None: 1630 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids) 1631 1632 if self.op.remove_uids is not None: 1633 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids) 1634 1635 if self.op.uid_pool is not None: 1636 self.cluster.uid_pool = self.op.uid_pool 1637 1638 if self.op.default_iallocator is not None: 1639 self.cluster.default_iallocator = self.op.default_iallocator 1640 1641 if self.op.default_iallocator_params is not None: 1642 self.cluster.default_iallocator_params = self.op.default_iallocator_params 1643 1644 if self.op.reserved_lvs is not None: 1645 self.cluster.reserved_lvs = self.op.reserved_lvs 1646 1647 if 
self.op.use_external_mip_script is not None: 1648 self.cluster.use_external_mip_script = self.op.use_external_mip_script 1649 1650 if self.op.enabled_user_shutdown is not None and \ 1651 self.cluster.enabled_user_shutdown != self.op.enabled_user_shutdown: 1652 self.cluster.enabled_user_shutdown = self.op.enabled_user_shutdown 1653 ensure_kvmd = True 1654 1655 def helper_os(aname, mods, desc): 1656 desc += " OS list" 1657 lst = getattr(self.cluster, aname) 1658 for key, val in mods: 1659 if key == constants.DDM_ADD: 1660 if val in lst: 1661 feedback_fn("OS %s already in %s, ignoring" % (val, desc)) 1662 else: 1663 lst.append(val) 1664 elif key == constants.DDM_REMOVE: 1665 if val in lst: 1666 lst.remove(val) 1667 else: 1668 feedback_fn("OS %s not found in %s, ignoring" % (val, desc)) 1669 else: 1670 raise errors.ProgrammerError("Invalid modification '%s'" % key)
1671 1672 if self.op.hidden_os: 1673 helper_os("hidden_os", self.op.hidden_os, "hidden") 1674 1675 if self.op.blacklisted_os: 1676 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted") 1677 1678 if self.op.mac_prefix: 1679 self.cluster.mac_prefix = self.op.mac_prefix 1680 1681 if self.op.master_netdev: 1682 master_params = self.cfg.GetMasterNetworkParameters() 1683 ems = self.cfg.GetUseExternalMipScript() 1684 feedback_fn("Shutting down master ip on the current netdev (%s)" % 1685 self.cluster.master_netdev) 1686 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid, 1687 master_params, ems) 1688 if not self.op.force: 1689 result.Raise("Could not disable the master ip") 1690 else: 1691 if result.fail_msg: 1692 msg = ("Could not disable the master ip (continuing anyway): %s" % 1693 result.fail_msg) 1694 feedback_fn(msg) 1695 feedback_fn("Changing master_netdev from %s to %s" % 1696 (master_params.netdev, self.op.master_netdev)) 1697 self.cluster.master_netdev = self.op.master_netdev 1698 1699 if self.op.master_netmask: 1700 master_params = self.cfg.GetMasterNetworkParameters() 1701 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask) 1702 result = self.rpc.call_node_change_master_netmask( 1703 master_params.uuid, master_params.netmask, 1704 self.op.master_netmask, master_params.ip, 1705 master_params.netdev) 1706 result.Warn("Could not change the master IP netmask", feedback_fn) 1707 self.cluster.master_netmask = self.op.master_netmask 1708 1709 if self.op.install_image: 1710 self.cluster.install_image = self.op.install_image 1711 1712 if self.op.zeroing_image is not None: 1713 CheckImageValidity(self.op.zeroing_image, 1714 "Zeroing image must be an absolute path or a URL") 1715 self.cluster.zeroing_image = self.op.zeroing_image 1716 1717 self.cfg.Update(self.cluster, feedback_fn) 1718 1719 if self.op.master_netdev: 1720 master_params = self.cfg.GetMasterNetworkParameters() 1721 feedback_fn("Starting the master ip on the new master netdev (%s)" % 1722 self.op.master_netdev) 1723 ems = self.cfg.GetUseExternalMipScript() 1724 result = self.rpc.call_node_activate_master_ip(master_params.uuid, 1725 master_params, ems) 1726 result.Warn("Could not re-enable the master ip on the master," 1727 " please restart manually", self.LogWarning) 1728 1729 # Even though 'self.op.enabled_user_shutdown' is being tested 1730 # above, the RPCs can only be done after 'self.cfg.Update' because 1731 # this will update the cluster object and sync 'Ssconf', and kvmd 1732 # uses 'Ssconf'. 1733 if ensure_kvmd: 1734 EnsureKvmdOnNodes(self, feedback_fn) 1735 1736 if self.op.compression_tools is not None: 1737 self.cfg.SetCompressionTools(self.op.compression_tools) 1738 1739 network_name = self.op.instance_communication_network 1740 if network_name is not None: 1741 return self._ModifyInstanceCommunicationNetwork(self.cfg, 1742 network_name, feedback_fn) 1743 else: 1744 return None 1745
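# Illustrative sketch (not part of the module): how the helper_os closure in
# Exec above applies (DDM_ADD, name) / (DDM_REMOVE, name) modification pairs
# to the hidden/blacklisted OS lists. The plain strings "add" and "remove"
# stand in for constants.DDM_ADD and constants.DDM_REMOVE here.
def apply_os_mods(os_list, mods, feedback_fn=lambda msg: None):
  """Return a copy of os_list with the given modifications applied."""
  result = list(os_list)
  for action, name in mods:
    if action == "add":        # stands in for constants.DDM_ADD
      if name in result:
        feedback_fn("OS %s already in list, ignoring" % name)
      else:
        result.append(name)
    elif action == "remove":   # stands in for constants.DDM_REMOVE
      if name in result:
        result.remove(name)
      else:
        feedback_fn("OS %s not found in list, ignoring" % name)
    else:
      raise ValueError("Invalid modification %r" % action)
  return result

# Example: apply_os_mods(["lenny"], [("add", "wheezy"), ("remove", "lenny")])
# returns ["wheezy"].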
1746 1747 -class LUClusterVerify(NoHooksLU):
1748 """Submits all jobs necessary to verify the cluster. 1749 1750 """ 1751 REQ_BGL = False 1752
1753 - def ExpandNames(self):
1754 self.needed_locks = {}
1755
1756 - def Exec(self, feedback_fn):
1757 jobs = [] 1758 1759 if self.op.group_name: 1760 groups = [self.op.group_name] 1761 depends_fn = lambda: None 1762 else: 1763 groups = self.cfg.GetNodeGroupList() 1764 1765 # Verify global configuration 1766 jobs.append([ 1767 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors), 1768 ]) 1769 1770 # Always depend on global verification 1771 depends_fn = lambda: [(-len(jobs), [])] 1772 1773 jobs.extend( 1774 [opcodes.OpClusterVerifyGroup(group_name=group, 1775 ignore_errors=self.op.ignore_errors, 1776 depends=depends_fn())] 1777 for group in groups) 1778 1779 # Fix up all parameters 1780 for op in itertools.chain(*jobs): # pylint: disable=W0142 1781 op.debug_simulate_errors = self.op.debug_simulate_errors 1782 op.verbose = self.op.verbose 1783 op.error_codes = self.op.error_codes 1784 try: 1785 op.skip_checks = self.op.skip_checks 1786 except AttributeError: 1787 assert not isinstance(op, opcodes.OpClusterVerifyGroup) 1788 1789 return ResultWithJobs(jobs)
1790
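# Illustrative sketch (not part of the module): the dependency entry that
# LUClusterVerify.Exec attaches to each per-group verification job, assuming
# the usual interpretation of negative job IDs as relative references within
# the same submission. Because jobs.extend consumes the generator one element
# at a time, depends_fn() is re-evaluated for every group job, so -len(jobs)
# always counts back past the group jobs queued so far to the single
# OpClusterVerifyConfig job at the head of the submission.
def config_verify_dependency(num_jobs_queued_so_far):
  """Relative dependency pointing back at the first job of the submission."""
  # With N jobs already queued (the config-verify job plus N-1 group jobs),
  # a relative reference of -N resolves to the config-verify job.
  return [(-num_jobs_queued_so_far, [])]

# First group job:  config_verify_dependency(1) -> [(-1, [])]
# Second group job: config_verify_dependency(2) -> [(-2, [])]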
1791 1792 -class _VerifyErrors(object):
1793 """Mix-in for cluster/group verify LUs. 1794 1795 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects 1796 self.op and self._feedback_fn to be available.) 1797 1798 """ 1799 1800 ETYPE_FIELD = "code" 1801 ETYPE_ERROR = constants.CV_ERROR 1802 ETYPE_WARNING = constants.CV_WARNING 1803
1804 - def _Error(self, ecode, item, msg, *args, **kwargs):
1805 """Format an error message. 1806 1807 Based on the opcode's error_codes parameter, either format a 1808 parseable error code, or a simpler error string. 1809 1810 This must be called only from Exec and functions called from Exec. 1811 1812 """ 1813 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) 1814 itype, etxt, _ = ecode 1815 # If the error code is in the list of ignored errors, demote the error to a 1816 # warning 1817 if etxt in self.op.ignore_errors: # pylint: disable=E1101 1818 ltype = self.ETYPE_WARNING 1819 # first complete the msg 1820 if args: 1821 msg = msg % args 1822 # then format the whole message 1823 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101 1824 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg) 1825 else: 1826 if item: 1827 item = " " + item 1828 else: 1829 item = "" 1830 msg = "%s: %s%s: %s" % (ltype, itype, item, msg) 1831 # and finally report it via the feedback_fn 1832 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101 1833 # do not mark the operation as failed for WARN cases only 1834 if ltype == self.ETYPE_ERROR: 1835 self.bad = True
1836
1837 - def _ErrorIf(self, cond, *args, **kwargs):
1838 """Log an error message if the passed condition is True. 1839 1840 """ 1841 if (bool(cond) 1842 or self.op.debug_simulate_errors): # pylint: disable=E1101 1843 self._Error(*args, **kwargs)
1844
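# Illustrative sketch (not part of the module): the two message formats
# produced by _Error above. CV_* error codes unpack as (item type, error
# text, description); the sample tuple in the usage comment is made up for
# the example, and "severity" corresponds to ETYPE_ERROR / ETYPE_WARNING.
def format_cv_message(ecode, item, msg, machine_readable, severity="ERROR"):
  """Format a cluster-verify message the way _Error does."""
  itype, etxt, _ = ecode
  if machine_readable:  # corresponds to the opcode's error_codes flag
    return "%s:%s:%s:%s:%s" % (severity, etxt, itype, item, msg)
  item = " " + item if item else ""
  return "%s: %s%s: %s" % (severity, itype, item, msg)

# format_cv_message(("node", "ENODENET", "network checks"), "node1.example.com",
#                   "missing bridges: br0", True)
# -> "ERROR:ENODENET:node:node1.example.com:missing bridges: br0"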
1845 1846 -def _GetAllHypervisorParameters(cluster, instances):
1847 """Compute the set of all hypervisor parameters. 1848 1849 @type cluster: L{objects.Cluster} 1850 @param cluster: the cluster object 1851 @param instances: list of L{objects.Instance} 1852 @param instances: additional instances from which to obtain parameters 1853 @rtype: list of (origin, hypervisor, parameters) 1854 @return: a list with all parameters found, indicating the hypervisor they 1855 apply to, and the origin (can be "cluster", "os X", or "instance Y") 1856 1857 """ 1858 hvp_data = [] 1859 1860 for hv_name in cluster.enabled_hypervisors: 1861 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) 1862 1863 for os_name, os_hvp in cluster.os_hvp.items(): 1864 for hv_name, hv_params in os_hvp.items(): 1865 if hv_params: 1866 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) 1867 hvp_data.append(("os %s" % os_name, hv_name, full_params)) 1868 1869 # TODO: collapse identical parameter values in a single one 1870 for instance in instances: 1871 if instance.hvparams: 1872 hvp_data.append(("instance %s" % instance.name, instance.hypervisor, 1873 cluster.FillHV(instance))) 1874 1875 return hvp_data
1876
1877 1878 -class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1879 """Verifies the cluster config. 1880 1881 """ 1882 REQ_BGL = False 1883
1884 - def _VerifyHVP(self, hvp_data):
1885 """Verifies locally the syntax of the hypervisor parameters. 1886 1887 """ 1888 for item, hv_name, hv_params in hvp_data: 1889 msg = ("hypervisor %s parameters syntax check (source %s): %%s" % 1890 (item, hv_name)) 1891 try: 1892 hv_class = hypervisor.GetHypervisorClass(hv_name) 1893 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) 1894 hv_class.CheckParameterSyntax(hv_params) 1895 except errors.GenericError, err: 1896 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
1897
1898 - def ExpandNames(self):
1899 self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET) 1900 self.share_locks = ShareAll()
1901
1902 - def CheckPrereq(self):
1903 """Check prerequisites. 1904 1905 """ 1906 # Retrieve all information 1907 self.all_group_info = self.cfg.GetAllNodeGroupsInfo() 1908 self.all_node_info = self.cfg.GetAllNodesInfo() 1909 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1910
1911 - def Exec(self, feedback_fn):
1912 """Verify integrity of cluster, performing various test on nodes. 1913 1914 """ 1915 self.bad = False 1916 self._feedback_fn = feedback_fn 1917 1918 feedback_fn("* Verifying cluster config") 1919 1920 for msg in self.cfg.VerifyConfig(): 1921 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg) 1922 1923 feedback_fn("* Verifying cluster certificate files") 1924 1925 for cert_filename in pathutils.ALL_CERT_FILES: 1926 (errcode, msg) = utils.VerifyCertificate(cert_filename) 1927 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode) 1928 1929 self._ErrorIf(not utils.CanRead(constants.LUXID_USER, 1930 pathutils.NODED_CERT_FILE), 1931 constants.CV_ECLUSTERCERT, 1932 None, 1933 pathutils.NODED_CERT_FILE + " must be accessible by the " + 1934 constants.LUXID_USER + " user") 1935 1936 feedback_fn("* Verifying hypervisor parameters") 1937 1938 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(), 1939 self.all_inst_info.values())) 1940 1941 feedback_fn("* Verifying all nodes belong to an existing group") 1942 1943 # We do this verification here because, should this bogus circumstance 1944 # occur, it would never be caught by VerifyGroup, which only acts on 1945 # nodes/instances reachable from existing node groups. 1946 1947 dangling_nodes = set(node for node in self.all_node_info.values() 1948 if node.group not in self.all_group_info) 1949 1950 dangling_instances = {} 1951 no_node_instances = [] 1952 1953 for inst in self.all_inst_info.values(): 1954 if inst.primary_node in [node.uuid for node in dangling_nodes]: 1955 dangling_instances.setdefault(inst.primary_node, []).append(inst) 1956 elif inst.primary_node not in self.all_node_info: 1957 no_node_instances.append(inst) 1958 1959 pretty_dangling = [ 1960 "%s (%s)" % 1961 (node.name, 1962 utils.CommaJoin(inst.name for 1963 inst in dangling_instances.get(node.uuid, []))) 1964 for node in dangling_nodes] 1965 1966 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES, 1967 None, 1968 "the following nodes (and their instances) belong to a non" 1969 " existing group: %s", utils.CommaJoin(pretty_dangling)) 1970 1971 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST, 1972 None, 1973 "the following instances have a non-existing primary-node:" 1974 " %s", utils.CommaJoin(inst.name for 1975 inst in no_node_instances)) 1976 1977 return not self.bad
1978
1979 1980 -class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1981 """Verifies the status of a node group. 1982 1983 """ 1984 HPATH = "cluster-verify" 1985 HTYPE = constants.HTYPE_CLUSTER 1986 REQ_BGL = False 1987 1988 _HOOKS_INDENT_RE = re.compile("^", re.M) 1989
1990 - class NodeImage(object):
1991 """A class representing the logical and physical status of a node. 1992 1993 @type uuid: string 1994 @ivar uuid: the node UUID to which this object refers 1995 @ivar volumes: a structure as returned from 1996 L{ganeti.backend.GetVolumeList} (runtime) 1997 @ivar instances: a list of running instances (runtime) 1998 @ivar pinst: list of configured primary instances (config) 1999 @ivar sinst: list of configured secondary instances (config) 2000 @ivar sbp: dictionary of {primary-node: list of instances} for all 2001 instances for which this node is secondary (config) 2002 @ivar mfree: free memory, as reported by hypervisor (runtime) 2003 @ivar dfree: free disk, as reported by the node (runtime) 2004 @ivar offline: the offline status (config) 2005 @type rpc_fail: boolean 2006 @ivar rpc_fail: whether the RPC verify call was successfull (overall, 2007 not whether the individual keys were correct) (runtime) 2008 @type lvm_fail: boolean 2009 @ivar lvm_fail: whether the RPC call didn't return valid LVM data 2010 @type hyp_fail: boolean 2011 @ivar hyp_fail: whether the RPC call didn't return the instance list 2012 @type ghost: boolean 2013 @ivar ghost: whether this is a known node or not (config) 2014 @type os_fail: boolean 2015 @ivar os_fail: whether the RPC call didn't return valid OS data 2016 @type oslist: list 2017 @ivar oslist: list of OSes as diagnosed by DiagnoseOS 2018 @type vm_capable: boolean 2019 @ivar vm_capable: whether the node can host instances 2020 @type pv_min: float 2021 @ivar pv_min: size in MiB of the smallest PVs 2022 @type pv_max: float 2023 @ivar pv_max: size in MiB of the biggest PVs 2024 2025 """
2026 - def __init__(self, offline=False, uuid=None, vm_capable=True):
2027 self.uuid = uuid 2028 self.volumes = {} 2029 self.instances = [] 2030 self.pinst = [] 2031 self.sinst = [] 2032 self.sbp = {} 2033 self.mfree = 0 2034 self.dfree = 0 2035 self.offline = offline 2036 self.vm_capable = vm_capable 2037 self.rpc_fail = False 2038 self.lvm_fail = False 2039 self.hyp_fail = False 2040 self.ghost = False 2041 self.os_fail = False 2042 self.oslist = {} 2043 self.pv_min = None 2044 self.pv_max = None
2045
2046 - def ExpandNames(self):
2047 # This raises errors.OpPrereqError on its own: 2048 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name) 2049 2050 # Get instances in node group; this is unsafe and needs verification later 2051 inst_uuids = \ 2052 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 2053 2054 self.needed_locks = { 2055 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids), 2056 locking.LEVEL_NODEGROUP: [self.group_uuid], 2057 locking.LEVEL_NODE: [], 2058 2059 # This opcode is run by watcher every five minutes and acquires all nodes 2060 # for a group. It doesn't run for a long time, so it's better to acquire 2061 # the node allocation lock as well. 2062 locking.LEVEL_NODE_ALLOC: locking.ALL_SET, 2063 } 2064 2065 self.share_locks = ShareAll()
2066
2067 - def DeclareLocks(self, level):
2068 if level == locking.LEVEL_NODE: 2069 # Get members of node group; this is unsafe and needs verification later 2070 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members) 2071 2072 # In Exec(), we warn about mirrored instances that have primary and 2073 # secondary living in separate node groups. To fully verify that 2074 # volumes for these instances are healthy, we will need to do an 2075 # extra call to their secondaries. We ensure here those nodes will 2076 # be locked. 2077 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE): 2078 # Important: access only the instances whose lock is owned 2079 instance = self.cfg.GetInstanceInfoByName(inst_name) 2080 if instance.disk_template in constants.DTS_INT_MIRROR: 2081 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid)) 2082 2083 self.needed_locks[locking.LEVEL_NODE] = nodes
2084
2085 - def CheckPrereq(self):
2086 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP) 2087 self.group_info = self.cfg.GetNodeGroup(self.group_uuid) 2088 2089 group_node_uuids = set(self.group_info.members) 2090 group_inst_uuids = \ 2091 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 2092 2093 unlocked_node_uuids = \ 2094 group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE)) 2095 2096 unlocked_inst_uuids = \ 2097 group_inst_uuids.difference( 2098 [self.cfg.GetInstanceInfoByName(name).uuid 2099 for name in self.owned_locks(locking.LEVEL_INSTANCE)]) 2100 2101 if unlocked_node_uuids: 2102 raise errors.OpPrereqError( 2103 "Missing lock for nodes: %s" % 2104 utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)), 2105 errors.ECODE_STATE) 2106 2107 if unlocked_inst_uuids: 2108 raise errors.OpPrereqError( 2109 "Missing lock for instances: %s" % 2110 utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)), 2111 errors.ECODE_STATE) 2112 2113 self.all_node_info = self.cfg.GetAllNodesInfo() 2114 self.all_inst_info = self.cfg.GetAllInstancesInfo() 2115 2116 self.my_node_uuids = group_node_uuids 2117 self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid]) 2118 for node_uuid in group_node_uuids) 2119 2120 self.my_inst_uuids = group_inst_uuids 2121 self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid]) 2122 for inst_uuid in group_inst_uuids) 2123 2124 # We detect here the nodes that will need the extra RPC calls for verifying 2125 # split LV volumes; they should be locked. 2126 extra_lv_nodes = set() 2127 2128 for inst in self.my_inst_info.values(): 2129 if inst.disk_template in constants.DTS_INT_MIRROR: 2130 inst_nodes = self.cfg.GetInstanceNodes(inst.uuid) 2131 for nuuid in inst_nodes: 2132 if self.all_node_info[nuuid].group != self.group_uuid: 2133 extra_lv_nodes.add(nuuid) 2134 2135 unlocked_lv_nodes = \ 2136 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE)) 2137 2138 if unlocked_lv_nodes: 2139 raise errors.OpPrereqError("Missing node locks for LV check: %s" % 2140 utils.CommaJoin(unlocked_lv_nodes), 2141 errors.ECODE_STATE) 2142 self.extra_lv_nodes = list(extra_lv_nodes)
2143
2144 - def _VerifyNode(self, ninfo, nresult):
2145 """Perform some basic validation on data returned from a node. 2146 2147 - check the result data structure is well formed and has all the 2148 mandatory fields 2149 - check ganeti version 2150 2151 @type ninfo: L{objects.Node} 2152 @param ninfo: the node to check 2153 @param nresult: the results from the node 2154 @rtype: boolean 2155 @return: whether overall this call was successful (and we can expect 2156 reasonable values in the respose) 2157 2158 """ 2159 # main result, nresult should be a non-empty dict 2160 test = not nresult or not isinstance(nresult, dict) 2161 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 2162 "unable to verify node: no data returned") 2163 if test: 2164 return False 2165 2166 # compares ganeti version 2167 local_version = constants.PROTOCOL_VERSION 2168 remote_version = nresult.get("version", None) 2169 test = not (remote_version and 2170 isinstance(remote_version, (list, tuple)) and 2171 len(remote_version) == 2) 2172 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 2173 "connection to node returned invalid data") 2174 if test: 2175 return False 2176 2177 test = local_version != remote_version[0] 2178 self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name, 2179 "incompatible protocol versions: master %s," 2180 " node %s", local_version, remote_version[0]) 2181 if test: 2182 return False 2183 2184 # node seems compatible, we can actually try to look into its results 2185 2186 # full package version 2187 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1], 2188 constants.CV_ENODEVERSION, ninfo.name, 2189 "software version mismatch: master %s, node %s", 2190 constants.RELEASE_VERSION, remote_version[1], 2191 code=self.ETYPE_WARNING) 2192 2193 hyp_result = nresult.get(constants.NV_HYPERVISOR, None) 2194 if ninfo.vm_capable and isinstance(hyp_result, dict): 2195 for hv_name, hv_result in hyp_result.iteritems(): 2196 test = hv_result is not None 2197 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 2198 "hypervisor %s verify failure: '%s'", hv_name, hv_result) 2199 2200 hvp_result = nresult.get(constants.NV_HVPARAMS, None) 2201 if ninfo.vm_capable and isinstance(hvp_result, list): 2202 for item, hv_name, hv_result in hvp_result: 2203 self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name, 2204 "hypervisor %s parameter verify failure (source %s): %s", 2205 hv_name, item, hv_result) 2206 2207 test = nresult.get(constants.NV_NODESETUP, 2208 ["Missing NODESETUP results"]) 2209 self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name, 2210 "node setup error: %s", "; ".join(test)) 2211 2212 return True
2213
2214 - def _VerifyNodeTime(self, ninfo, nresult, 2215 nvinfo_starttime, nvinfo_endtime):
2216 """Check the node time. 2217 2218 @type ninfo: L{objects.Node} 2219 @param ninfo: the node to check 2220 @param nresult: the remote results for the node 2221 @param nvinfo_starttime: the start time of the RPC call 2222 @param nvinfo_endtime: the end time of the RPC call 2223 2224 """ 2225 ntime = nresult.get(constants.NV_TIME, None) 2226 try: 2227 ntime_merged = utils.MergeTime(ntime) 2228 except (ValueError, TypeError): 2229 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name, 2230 "Node returned invalid time") 2231 return 2232 2233 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): 2234 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged) 2235 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW): 2236 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime) 2237 else: 2238 ntime_diff = None 2239 2240 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name, 2241 "Node time diverges by at least %s from master node time", 2242 ntime_diff)
2243
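# Illustrative sketch (not part of the module): the clock-skew window applied
# above. MAX_SKEW stands in for constants.NODE_MAX_CLOCK_SKEW (the exact
# value is an assumption here); the merged node time is compared against the
# start and end timestamps of the node-verify RPC.
MAX_SKEW = 150.0  # seconds; assumed stand-in for constants.NODE_MAX_CLOCK_SKEW

def clock_skew_excess(node_time, rpc_start, rpc_end, max_skew=MAX_SKEW):
  """Return the reported divergence string, or None if within bounds."""
  if node_time < (rpc_start - max_skew):
    return "%.01fs" % abs(rpc_start - node_time)
  if node_time > (rpc_end + max_skew):
    return "%.01fs" % abs(node_time - rpc_end)
  return None

# clock_skew_excess(1000.0, 1200.0, 1201.0) -> "200.0s" (node clock too slow)
# clock_skew_excess(1210.0, 1200.0, 1201.0) -> None (within the allowed skew)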
2244 - def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
2245 """Check the node LVM results and update info for cross-node checks. 2246 2247 @type ninfo: L{objects.Node} 2248 @param ninfo: the node to check 2249 @param nresult: the remote results for the node 2250 @param vg_name: the configured VG name 2251 @type nimg: L{NodeImage} 2252 @param nimg: node image 2253 2254 """ 2255 if vg_name is None: 2256 return 2257 2258 # checks vg existence and size > 20G 2259 vglist = nresult.get(constants.NV_VGLIST, None) 2260 test = not vglist 2261 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 2262 "unable to check volume groups") 2263 if not test: 2264 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name, 2265 constants.MIN_VG_SIZE) 2266 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus) 2267 2268 # Check PVs 2269 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage) 2270 for em in errmsgs: 2271 self._Error(constants.CV_ENODELVM, ninfo.name, em) 2272 if pvminmax is not None: 2273 (nimg.pv_min, nimg.pv_max) = pvminmax
2274
2275 - def _VerifyGroupDRBDVersion(self, node_verify_infos):
2276 """Check cross-node DRBD version consistency. 2277 2278 @type node_verify_infos: dict 2279 @param node_verify_infos: infos about nodes as returned from the 2280 node_verify call. 2281 2282 """ 2283 node_versions = {} 2284 for node_uuid, ndata in node_verify_infos.items(): 2285 nresult = ndata.payload 2286 if nresult: 2287 version = nresult.get(constants.NV_DRBDVERSION, None) 2288 if version: 2289 node_versions[node_uuid] = version 2290 2291 if len(set(node_versions.values())) > 1: 2292 for node_uuid, version in sorted(node_versions.items()): 2293 msg = "DRBD version mismatch: %s" % version 2294 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg, 2295 code=self.ETYPE_WARNING)
2296
2297 - def _VerifyGroupLVM(self, node_image, vg_name):
2298 """Check cross-node consistency in LVM. 2299 2300 @type node_image: dict 2301 @param node_image: info about nodes, mapping from node to names to 2302 L{NodeImage} objects 2303 @param vg_name: the configured VG name 2304 2305 """ 2306 if vg_name is None: 2307 return 2308 2309 # Only exclusive storage needs this kind of checks 2310 if not self._exclusive_storage: 2311 return 2312 2313 # exclusive_storage wants all PVs to have the same size (approximately), 2314 # if the smallest and the biggest ones are okay, everything is fine. 2315 # pv_min is None iff pv_max is None 2316 vals = filter((lambda ni: ni.pv_min is not None), node_image.values()) 2317 if not vals: 2318 return 2319 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals) 2320 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals) 2321 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax) 2322 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name, 2323 "PV sizes differ too much in the group; smallest (%s MB) is" 2324 " on %s, biggest (%s MB) is on %s", 2325 pvmin, self.cfg.GetNodeName(minnode_uuid), 2326 pvmax, self.cfg.GetNodeName(maxnode_uuid))
2327
2328 - def _VerifyNodeBridges(self, ninfo, nresult, bridges):
2329 """Check the node bridges. 2330 2331 @type ninfo: L{objects.Node} 2332 @param ninfo: the node to check 2333 @param nresult: the remote results for the node 2334 @param bridges: the expected list of bridges 2335 2336 """ 2337 if not bridges: 2338 return 2339 2340 missing = nresult.get(constants.NV_BRIDGES, None) 2341 test = not isinstance(missing, list) 2342 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 2343 "did not return valid bridge information") 2344 if not test: 2345 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name, 2346 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
2347
2348 - def _VerifyNodeUserScripts(self, ninfo, nresult):
2349 """Check the results of user scripts presence and executability on the node 2350 2351 @type ninfo: L{objects.Node} 2352 @param ninfo: the node to check 2353 @param nresult: the remote results for the node 2354 2355 """ 2356 test = not constants.NV_USERSCRIPTS in nresult 2357 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 2358 "did not return user scripts information") 2359 2360 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None) 2361 if not test: 2362 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 2363 "user scripts not present or not executable: %s" % 2364 utils.CommaJoin(sorted(broken_scripts)))
2365
2366 - def _VerifyNodeNetwork(self, ninfo, nresult):
2367 """Check the node network connectivity results. 2368 2369 @type ninfo: L{objects.Node} 2370 @param ninfo: the node to check 2371 @param nresult: the remote results for the node 2372 2373 """ 2374 test = constants.NV_NODELIST not in nresult 2375 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name, 2376 "node hasn't returned node ssh connectivity data") 2377 if not test: 2378 if nresult[constants.NV_NODELIST]: 2379 for a_node, a_msg in nresult[constants.NV_NODELIST].items(): 2380 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name, 2381 "ssh communication with node '%s': %s", a_node, a_msg) 2382 2383 test = constants.NV_NODENETTEST not in nresult 2384 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 2385 "node hasn't returned node tcp connectivity data") 2386 if not test: 2387 if nresult[constants.NV_NODENETTEST]: 2388 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) 2389 for anode in nlist: 2390 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, 2391 "tcp communication with node '%s': %s", 2392 anode, nresult[constants.NV_NODENETTEST][anode]) 2393 2394 test = constants.NV_MASTERIP not in nresult 2395 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 2396 "node hasn't returned node master IP reachability data") 2397 if not test: 2398 if not nresult[constants.NV_MASTERIP]: 2399 if ninfo.uuid == self.master_node: 2400 msg = "the master node cannot reach the master IP (not configured?)" 2401 else: 2402 msg = "cannot reach the master IP" 2403 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
2404
2405 - def _VerifyInstance(self, instance, node_image, diskstatus):
2406 """Verify an instance. 2407 2408 This function checks to see if the required block devices are 2409 available on the instance's node, and that the nodes are in the correct 2410 state. 2411 2412 """ 2413 pnode_uuid = instance.primary_node 2414 pnode_img = node_image[pnode_uuid] 2415 groupinfo = self.cfg.GetAllNodeGroupsInfo() 2416 2417 node_vol_should = {} 2418 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 2419 2420 cluster = self.cfg.GetClusterInfo() 2421 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster, 2422 self.group_info) 2423 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg) 2424 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name, 2425 utils.CommaJoin(err), code=self.ETYPE_WARNING) 2426 2427 for node_uuid in node_vol_should: 2428 n_img = node_image[node_uuid] 2429 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail: 2430 # ignore missing volumes on offline or broken nodes 2431 continue 2432 for volume in node_vol_should[node_uuid]: 2433 test = volume not in n_img.volumes 2434 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name, 2435 "volume %s missing on node %s", volume, 2436 self.cfg.GetNodeName(node_uuid)) 2437 2438 if instance.admin_state == constants.ADMINST_UP: 2439 test = instance.uuid not in pnode_img.instances and not pnode_img.offline 2440 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name, 2441 "instance not running on its primary node %s", 2442 self.cfg.GetNodeName(pnode_uuid)) 2443 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE, 2444 instance.name, "instance is marked as running and lives on" 2445 " offline node %s", self.cfg.GetNodeName(pnode_uuid)) 2446 2447 diskdata = [(nname, success, status, idx) 2448 for (nname, disks) in diskstatus.items() 2449 for idx, (success, status) in enumerate(disks)] 2450 2451 for nname, success, bdev_status, idx in diskdata: 2452 # the 'ghost node' construction in Exec() ensures that we have a 2453 # node here 2454 snode = node_image[nname] 2455 bad_snode = snode.ghost or snode.offline 2456 self._ErrorIf(instance.disks_active and 2457 not success and not bad_snode, 2458 constants.CV_EINSTANCEFAULTYDISK, instance.name, 2459 "couldn't retrieve status for disk/%s on %s: %s", 2460 idx, self.cfg.GetNodeName(nname), bdev_status) 2461 2462 if instance.disks_active and success and \ 2463 (bdev_status.is_degraded or 2464 bdev_status.ldisk_status != constants.LDS_OKAY): 2465 msg = "disk/%s on %s" % (idx, self.cfg.GetNodeName(nname)) 2466 if bdev_status.is_degraded: 2467 msg += " is degraded" 2468 if bdev_status.ldisk_status != constants.LDS_OKAY: 2469 msg += "; state is '%s'" % \ 2470 constants.LDS_NAMES[bdev_status.ldisk_status] 2471 2472 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg) 2473 2474 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline, 2475 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid), 2476 "instance %s, connection to primary node failed", 2477 instance.name) 2478 2479 secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid) 2480 self._ErrorIf(len(secondary_nodes) > 1, 2481 constants.CV_EINSTANCELAYOUT, instance.name, 2482 "instance has multiple secondary nodes: %s", 2483 utils.CommaJoin(secondary_nodes), 2484 code=self.ETYPE_WARNING) 2485 2486 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid) 2487 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes) 2488 if any(es_flags.values()): 2489 if instance.disk_template not in constants.DTS_EXCL_STORAGE: 2490 # Disk 
template not compatible with exclusive_storage: no instance 2491 # node should have the flag set 2492 es_nodes = [n 2493 for (n, es) in es_flags.items() 2494 if es] 2495 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name, 2496 "instance has template %s, which is not supported on nodes" 2497 " that have exclusive storage set: %s", 2498 instance.disk_template, 2499 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes))) 2500 for (idx, disk) in enumerate(self.cfg.GetInstanceDisks(instance.uuid)): 2501 self._ErrorIf(disk.spindles is None, 2502 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name, 2503 "number of spindles not configured for disk %s while" 2504 " exclusive storage is enabled, try running" 2505 " gnt-cluster repair-disk-sizes", idx) 2506 2507 if instance.disk_template in constants.DTS_INT_MIRROR: 2508 instance_nodes = utils.NiceSort(inst_nodes) 2509 instance_groups = {} 2510 2511 for node_uuid in instance_nodes: 2512 instance_groups.setdefault(self.all_node_info[node_uuid].group, 2513 []).append(node_uuid) 2514 2515 pretty_list = [ 2516 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)), 2517 groupinfo[group].name) 2518 # Sort so that we always list the primary node first. 2519 for group, nodes in sorted(instance_groups.items(), 2520 key=lambda (_, nodes): pnode_uuid in nodes, 2521 reverse=True)] 2522 2523 self._ErrorIf(len(instance_groups) > 1, 2524 constants.CV_EINSTANCESPLITGROUPS, 2525 instance.name, "instance has primary and secondary nodes in" 2526 " different groups: %s", utils.CommaJoin(pretty_list), 2527 code=self.ETYPE_WARNING) 2528 2529 inst_nodes_offline = [] 2530 for snode in secondary_nodes: 2531 s_img = node_image[snode] 2532 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC, 2533 self.cfg.GetNodeName(snode), 2534 "instance %s, connection to secondary node failed", 2535 instance.name) 2536 2537 if s_img.offline: 2538 inst_nodes_offline.append(snode) 2539 2540 # warn that the instance lives on offline nodes 2541 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, 2542 instance.name, "instance has offline secondary node(s) %s", 2543 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline))) 2544 # ... or ghost/non-vm_capable nodes 2545 for node_uuid in inst_nodes: 2546 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE, 2547 instance.name, "instance lives on ghost node %s", 2548 self.cfg.GetNodeName(node_uuid)) 2549 self._ErrorIf(not node_image[node_uuid].vm_capable, 2550 constants.CV_EINSTANCEBADNODE, instance.name, 2551 "instance lives on non-vm_capable node %s", 2552 self.cfg.GetNodeName(node_uuid))
2553
2554 - def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image, 2555 reserved):
2556 """Verify if there are any unknown volumes in the cluster. 2557 2558 The .os, .swap and backup volumes are ignored. All other volumes are 2559 reported as unknown. 2560 2561 @type vg_name: string 2562 @param vg_name: the name of the Ganeti-administered volume group 2563 @type reserved: L{ganeti.utils.FieldSet} 2564 @param reserved: a FieldSet of reserved volume names 2565 2566 """ 2567 for node_uuid, n_img in node_image.items(): 2568 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or 2569 self.all_node_info[node_uuid].group != self.group_uuid): 2570 # skip non-healthy nodes 2571 continue 2572 for volume in n_img.volumes: 2573 # skip volumes not belonging to the ganeti-administered volume group 2574 if volume.split('/')[0] != vg_name: 2575 continue 2576 2577 test = ((node_uuid not in node_vol_should or 2578 volume not in node_vol_should[node_uuid]) and 2579 not reserved.Matches(volume)) 2580 self._ErrorIf(test, constants.CV_ENODEORPHANLV, 2581 self.cfg.GetNodeName(node_uuid), 2582 "volume %s is unknown", volume, 2583 code=_VerifyErrors.ETYPE_WARNING)
2584
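# Illustrative sketch (not part of the module): the per-volume test used by
# _VerifyOrphanVolumes above. A plain set of reserved volume names stands in
# for the L{ganeti.utils.FieldSet} of reserved LV patterns, so the pattern
# matching aspect is simplified away here.
def orphan_volumes(vg_name, reported_volumes, expected_volumes, reserved):
  """Return the reported volumes in vg_name that are neither expected
  nor reserved."""
  orphans = []
  for volume in reported_volumes:
    if volume.split("/")[0] != vg_name:
      continue  # not in the Ganeti-administered volume group
    if volume in expected_volumes or volume in reserved:
      continue
    orphans.append(volume)
  return orphans

# orphan_volumes("xenvg", ["xenvg/stray", "othervg/foo"], set(), set())
# -> ["xenvg/stray"]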
2585 - def _VerifyNPlusOneMemory(self, node_image, all_insts):
2586 """Verify N+1 Memory Resilience. 2587 2588 Check that if one single node dies we can still start all the 2589 instances it was primary for. 2590 2591 """ 2592 cluster_info = self.cfg.GetClusterInfo() 2593 for node_uuid, n_img in node_image.items(): 2594 # This code checks that every node which is now listed as 2595 # secondary has enough memory to host all instances it is 2596 # supposed to should a single other node in the cluster fail. 2597 # FIXME: not ready for failover to an arbitrary node 2598 # FIXME: does not support file-backed instances 2599 # WARNING: we currently take into account down instances as well 2600 # as up ones, considering that even if they're down someone 2601 # might want to start them even in the event of a node failure. 2602 if n_img.offline or \ 2603 self.all_node_info[node_uuid].group != self.group_uuid: 2604 # we're skipping nodes marked offline and nodes in other groups from 2605 # the N+1 warning, since most likely we don't have good memory 2606 # information from them; we already list instances living on such 2607 # nodes, and that's enough warning 2608 continue 2609 #TODO(dynmem): also consider ballooning out other instances 2610 for prinode, inst_uuids in n_img.sbp.items(): 2611 needed_mem = 0 2612 for inst_uuid in inst_uuids: 2613 bep = cluster_info.FillBE(all_insts[inst_uuid]) 2614 if bep[constants.BE_AUTO_BALANCE]: 2615 needed_mem += bep[constants.BE_MINMEM] 2616 test = n_img.mfree < needed_mem 2617 self._ErrorIf(test, constants.CV_ENODEN1, 2618 self.cfg.GetNodeName(node_uuid), 2619 "not enough memory to accomodate instance failovers" 2620 " should node %s fail (%dMiB needed, %dMiB available)", 2621 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
2622
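# Illustrative sketch (not part of the module): the per-node N+1 memory test
# above, with plain dictionaries. sbp maps each primary node to the
# instances for which the checked node acts as secondary; each instance dict
# stands in for its filled-in backend parameters (BE_AUTO_BALANCE and
# BE_MINMEM).
def n_plus_one_failures(mfree_mib, sbp):
  """Return (primary node, needed MiB) pairs whose failover would not fit."""
  failures = []
  for prinode, insts in sbp.items():
    needed = sum(inst["minmem"] for inst in insts if inst["auto_balance"])
    if mfree_mib < needed:
      failures.append((prinode, needed))
  return failures

# n_plus_one_failures(1024, {"node2": [{"minmem": 2048, "auto_balance": True}]})
# -> [("node2", 2048)]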
2623 - def _VerifyClientCertificates(self, nodes, all_nvinfo):
2624 """Verifies the consistency of the client certificates. 2625 2626 This includes several aspects: 2627 - the individual validation of all nodes' certificates 2628 - the consistency of the master candidate certificate map 2629 - the consistency of the master candidate certificate map with the 2630 certificates that the master candidates are actually using. 2631 2632 @param nodes: the list of nodes to consider in this verification 2633 @param all_nvinfo: the map of results of the verify_node call to 2634 all nodes 2635 2636 """ 2637 candidate_certs = self.cfg.GetClusterInfo().candidate_certs 2638 if candidate_certs is None or len(candidate_certs) == 0: 2639 self._ErrorIf( 2640 True, constants.CV_ECLUSTERCLIENTCERT, None, 2641 "The cluster's list of master candidate certificates is empty." 2642 " If you just updated the cluster, please run" 2643 " 'gnt-cluster renew-crypto --new-node-certificates'.") 2644 return 2645 2646 self._ErrorIf( 2647 len(candidate_certs) != len(set(candidate_certs.values())), 2648 constants.CV_ECLUSTERCLIENTCERT, None, 2649 "There are at least two master candidates configured to use the same" 2650 " certificate.") 2651 2652 # collect the client certificate 2653 for node in nodes: 2654 if node.offline: 2655 continue 2656 2657 nresult = all_nvinfo[node.uuid] 2658 if nresult.fail_msg or not nresult.payload: 2659 continue 2660 2661 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None) 2662 2663 self._ErrorIf( 2664 errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None, 2665 "Client certificate of node '%s' failed validation: %s (code '%s')", 2666 node.uuid, msg, errcode) 2667 2668 if not errcode: 2669 digest = msg 2670 if node.master_candidate: 2671 if node.uuid in candidate_certs: 2672 self._ErrorIf( 2673 digest != candidate_certs[node.uuid], 2674 constants.CV_ECLUSTERCLIENTCERT, None, 2675 "Client certificate digest of master candidate '%s' does not" 2676 " match its entry in the cluster's map of master candidate" 2677 " certificates. Expected: %s Got: %s", node.uuid, 2678 digest, candidate_certs[node.uuid]) 2679 else: 2680 self._ErrorIf( 2681 True, constants.CV_ECLUSTERCLIENTCERT, None, 2682 "The master candidate '%s' does not have an entry in the" 2683 " map of candidate certificates.", node.uuid) 2684 self._ErrorIf( 2685 digest in candidate_certs.values(), 2686 constants.CV_ECLUSTERCLIENTCERT, None, 2687 "Master candidate '%s' is using a certificate of another node.", 2688 node.uuid) 2689 else: 2690 self._ErrorIf( 2691 node.uuid in candidate_certs, 2692 constants.CV_ECLUSTERCLIENTCERT, None, 2693 "Node '%s' is not a master candidate, but still listed in the" 2694 " map of master candidate certificates.", node.uuid) 2695 self._ErrorIf( 2696 (node.uuid not in candidate_certs) and 2697 (digest in candidate_certs.values()), 2698 constants.CV_ECLUSTERCLIENTCERT, None, 2699 "Node '%s' is not a master candidate and is incorrectly using a" 2700 " certificate of another node which is master candidate.", 2701 node.uuid)
2702
2703 - def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo, 2704 (files_all, files_opt, files_mc, files_vm)):
2705 """Verifies file checksums collected from all nodes. 2706 2707 @param nodes: List of L{objects.Node} objects 2708 @param master_node_uuid: UUID of master node 2709 @param all_nvinfo: RPC results 2710 2711 """ 2712 # Define functions determining which nodes to consider for a file 2713 files2nodefn = [ 2714 (files_all, None), 2715 (files_mc, lambda node: (node.master_candidate or 2716 node.uuid == master_node_uuid)), 2717 (files_vm, lambda node: node.vm_capable), 2718 ] 2719 2720 # Build mapping from filename to list of nodes which should have the file 2721 nodefiles = {} 2722 for (files, fn) in files2nodefn: 2723 if fn is None: 2724 filenodes = nodes 2725 else: 2726 filenodes = filter(fn, nodes) 2727 nodefiles.update((filename, 2728 frozenset(map(operator.attrgetter("uuid"), filenodes))) 2729 for filename in files) 2730 2731 assert set(nodefiles) == (files_all | files_mc | files_vm) 2732 2733 fileinfo = dict((filename, {}) for filename in nodefiles) 2734 ignore_nodes = set() 2735 2736 for node in nodes: 2737 if node.offline: 2738 ignore_nodes.add(node.uuid) 2739 continue 2740 2741 nresult = all_nvinfo[node.uuid] 2742 2743 if nresult.fail_msg or not nresult.payload: 2744 node_files = None 2745 else: 2746 fingerprints = nresult.payload.get(constants.NV_FILELIST, {}) 2747 node_files = dict((vcluster.LocalizeVirtualPath(key), value) 2748 for (key, value) in fingerprints.items()) 2749 del fingerprints 2750 2751 test = not (node_files and isinstance(node_files, dict)) 2752 self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name, 2753 "Node did not return file checksum data") 2754 if test: 2755 ignore_nodes.add(node.uuid) 2756 continue 2757 2758 # Build per-checksum mapping from filename to nodes having it 2759 for (filename, checksum) in node_files.items(): 2760 assert filename in nodefiles 2761 fileinfo[filename].setdefault(checksum, set()).add(node.uuid) 2762 2763 for (filename, checksums) in fileinfo.items(): 2764 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum" 2765 2766 # Nodes having the file 2767 with_file = frozenset(node_uuid 2768 for node_uuids in fileinfo[filename].values() 2769 for node_uuid in node_uuids) - ignore_nodes 2770 2771 expected_nodes = nodefiles[filename] - ignore_nodes 2772 2773 # Nodes missing file 2774 missing_file = expected_nodes - with_file 2775 2776 if filename in files_opt: 2777 # All or no nodes 2778 self._ErrorIf(missing_file and missing_file != expected_nodes, 2779 constants.CV_ECLUSTERFILECHECK, None, 2780 "File %s is optional, but it must exist on all or no" 2781 " nodes (not found on %s)", 2782 filename, 2783 utils.CommaJoin( 2784 utils.NiceSort( 2785 map(self.cfg.GetNodeName, missing_file)))) 2786 else: 2787 self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None, 2788 "File %s is missing from node(s) %s", filename, 2789 utils.CommaJoin( 2790 utils.NiceSort( 2791 map(self.cfg.GetNodeName, missing_file)))) 2792 2793 # Warn if a node has a file it shouldn't 2794 unexpected = with_file - expected_nodes 2795 self._ErrorIf(unexpected, 2796 constants.CV_ECLUSTERFILECHECK, None, 2797 "File %s should not exist on node(s) %s", 2798 filename, utils.CommaJoin( 2799 utils.NiceSort(map(self.cfg.GetNodeName, unexpected)))) 2800 2801 # See if there are multiple versions of the file 2802 test = len(checksums) > 1 2803 if test: 2804 variants = ["variant %s on %s" % 2805 (idx + 1, 2806 utils.CommaJoin(utils.NiceSort( 2807 map(self.cfg.GetNodeName, node_uuids)))) 2808 for (idx, (checksum, node_uuids)) in 2809 
enumerate(sorted(checksums.items()))] 2810 else: 2811 variants = [] 2812 2813 self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None, 2814 "File %s found with %s different checksums (%s)", 2815 filename, len(checksums), "; ".join(variants))
2816
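# Illustrative sketch (not part of the module): the per-file decision logic of
# _VerifyFiles above. checksums maps checksum -> set of node names reporting
# it; expected is the set of nodes that should have the file (offline and
# unreachable nodes are assumed to have been removed already, as done via
# ignore_nodes above).
def file_check_issues(checksums, expected, optional=False):
  """Return a list of human-readable problems for one distributed file."""
  with_file = set()
  for nodes in checksums.values():
    with_file |= nodes
  missing = expected - with_file
  unexpected = with_file - expected
  issues = []
  if optional:
    if missing and missing != expected:
      issues.append("optional file missing from: %s" %
                    ", ".join(sorted(missing)))
  elif missing:
    issues.append("missing from: %s" % ", ".join(sorted(missing)))
  if unexpected:
    issues.append("should not exist on: %s" % ", ".join(sorted(unexpected)))
  if len(checksums) > 1:
    issues.append("%d different checksums found" % len(checksums))
  return issues

# file_check_issues({"abc123...": set(["node1"])}, set(["node1", "node2"]))
# -> ["missing from: node2"]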
2817 - def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
2818 """Verify the drbd helper. 2819 2820 """ 2821 if drbd_helper: 2822 helper_result = nresult.get(constants.NV_DRBDHELPER, None) 2823 test = (helper_result is None) 2824 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 2825 "no drbd usermode helper returned") 2826 if helper_result: 2827 status, payload = helper_result 2828 test = not status 2829 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 2830 "drbd usermode helper check unsuccessful: %s", payload) 2831 test = status and (payload != drbd_helper) 2832 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 2833 "wrong drbd usermode helper: %s", payload)
2834
2835 - def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper, 2836 drbd_map):
2837 """Verifies and the node DRBD status. 2838 2839 @type ninfo: L{objects.Node} 2840 @param ninfo: the node to check 2841 @param nresult: the remote results for the node 2842 @param instanceinfo: the dict of instances 2843 @param drbd_helper: the configured DRBD usermode helper 2844 @param drbd_map: the DRBD map as returned by 2845 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 2846 2847 """ 2848 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper) 2849 2850 # compute the DRBD minors 2851 node_drbd = {} 2852 for minor, inst_uuid in drbd_map[ninfo.uuid].items(): 2853 test = inst_uuid not in instanceinfo 2854 self._ErrorIf(test, constants.CV_ECLUSTERCFG, None, 2855 "ghost instance '%s' in temporary DRBD map", inst_uuid) 2856 # ghost instance should not be running, but otherwise we 2857 # don't give double warnings (both ghost instance and 2858 # unallocated minor in use) 2859 if test: 2860 node_drbd[minor] = (inst_uuid, False) 2861 else: 2862 instance = instanceinfo[inst_uuid] 2863 node_drbd[minor] = (inst_uuid, instance.disks_active) 2864 2865 # and now check them 2866 used_minors = nresult.get(constants.NV_DRBDLIST, []) 2867 test = not isinstance(used_minors, (tuple, list)) 2868 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 2869 "cannot parse drbd status file: %s", str(used_minors)) 2870 if test: 2871 # we cannot check drbd status 2872 return 2873 2874 for minor, (inst_uuid, must_exist) in node_drbd.items(): 2875 test = minor not in used_minors and must_exist 2876 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 2877 "drbd minor %d of instance %s is not active", minor, 2878 self.cfg.GetInstanceName(inst_uuid)) 2879 for minor in used_minors: 2880 test = minor not in node_drbd 2881 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 2882 "unallocated drbd minor %d is in use", minor)
2883
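# Illustrative sketch (not part of the module): the minor comparison at the
# end of _VerifyNodeDrbd above. node_drbd maps each configured minor to
# (instance name, disks_active); used_minors is what the node reports via
# NV_DRBDLIST.
def drbd_minor_issues(node_drbd, used_minors):
  """Return a list of problems with the node's DRBD minors."""
  issues = []
  for minor, (inst_name, must_exist) in node_drbd.items():
    if must_exist and minor not in used_minors:
      issues.append("minor %d of instance %s is not active" %
                    (minor, inst_name))
  for minor in used_minors:
    if minor not in node_drbd:
      issues.append("unallocated minor %d is in use" % minor)
  return issues

# drbd_minor_issues({0: ("inst1", True), 1: ("inst2", False)}, [1, 2])
# -> ["minor 0 of instance inst1 is not active",
#     "unallocated minor 2 is in use"]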
2884 - def _UpdateNodeOS(self, ninfo, nresult, nimg):
2885 """Builds the node OS structures. 2886 2887 @type ninfo: L{objects.Node} 2888 @param ninfo: the node to check 2889 @param nresult: the remote results for the node 2890 @param nimg: the node image object 2891 2892 """ 2893 remote_os = nresult.get(constants.NV_OSLIST, None) 2894 test = (not isinstance(remote_os, list) or 2895 not compat.all(isinstance(v, list) and len(v) == 8 2896 for v in remote_os)) 2897 2898 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 2899 "node hasn't returned valid OS data") 2900 2901 nimg.os_fail = test 2902 2903 if test: 2904 return 2905 2906 os_dict = {} 2907 2908 for (name, os_path, status, diagnose, 2909 variants, parameters, api_ver, 2910 trusted) in nresult[constants.NV_OSLIST]: 2911 2912 if name not in os_dict: 2913 os_dict[name] = [] 2914 2915 # parameters is a list of lists instead of list of tuples due to 2916 # JSON lacking a real tuple type, fix it: 2917 parameters = [tuple(v) for v in parameters] 2918 os_dict[name].append((os_path, status, diagnose, 2919 set(variants), set(parameters), set(api_ver), 2920 trusted)) 2921 2922 nimg.oslist = os_dict
2923
2924 - def _VerifyNodeOS(self, ninfo, nimg, base):
2925 """Verifies the node OS list. 2926 2927 @type ninfo: L{objects.Node} 2928 @param ninfo: the node to check 2929 @param nimg: the node image object 2930 @param base: the 'template' node we match against (e.g. from the master) 2931 2932 """ 2933 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?" 2934 2935 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l] 2936 for os_name, os_data in nimg.oslist.items(): 2937 assert os_data, "Empty OS status for OS %s?!" % os_name 2938 f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0] 2939 self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name, 2940 "Invalid OS %s (located at %s): %s", 2941 os_name, f_path, f_diag) 2942 self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name, 2943 "OS '%s' has multiple entries" 2944 " (first one shadows the rest): %s", 2945 os_name, utils.CommaJoin([v[0] for v in os_data])) 2946 # comparisons with the 'base' image 2947 test = os_name not in base.oslist 2948 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 2949 "Extra OS %s not present on reference node (%s)", 2950 os_name, self.cfg.GetNodeName(base.uuid)) 2951 if test: 2952 continue 2953 assert base.oslist[os_name], "Base node has empty OS status?" 2954 _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0] 2955 if not b_status: 2956 # base OS is invalid, skipping 2957 continue 2958 for kind, a, b in [("API version", f_api, b_api), 2959 ("variants list", f_var, b_var), 2960 ("parameters", beautify_params(f_param), 2961 beautify_params(b_param))]: 2962 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name, 2963 "OS %s for %s differs from reference node %s:" 2964 " [%s] vs. [%s]", kind, os_name, 2965 self.cfg.GetNodeName(base.uuid), 2966 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b))) 2967 for kind, a, b in [("trusted", f_trusted, b_trusted)]: 2968 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name, 2969 "OS %s for %s differs from reference node %s:" 2970 " %s vs. %s", kind, os_name, 2971 self.cfg.GetNodeName(base.uuid), a, b) 2972 2973 # check any missing OSes 2974 missing = set(base.oslist.keys()).difference(nimg.oslist.keys()) 2975 self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name, 2976 "OSes present on reference node %s" 2977 " but missing on this node: %s", 2978 self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
2979
2980 - def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
2981 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}. 2982 2983 @type ninfo: L{objects.Node} 2984 @param ninfo: the node to check 2985 @param nresult: the remote results for the node 2986 @type is_master: bool 2987 @param is_master: Whether node is the master node 2988 2989 """ 2990 cluster = self.cfg.GetClusterInfo() 2991 if (is_master and 2992 (cluster.IsFileStorageEnabled() or 2993 cluster.IsSharedFileStorageEnabled())): 2994 try: 2995 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS] 2996 except KeyError: 2997 # This should never happen 2998 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 2999 "Node did not return forbidden file storage paths") 3000 else: 3001 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 3002 "Found forbidden file storage paths: %s", 3003 utils.CommaJoin(fspaths)) 3004 else: 3005 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult, 3006 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 3007 "Node should not have returned forbidden file storage" 3008 " paths")
3009
3010 - def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template, 3011 verify_key, error_key):
3012 """Verifies (file) storage paths. 3013 3014 @type ninfo: L{objects.Node} 3015 @param ninfo: the node to check 3016 @param nresult: the remote results for the node 3017 @type file_disk_template: string 3018 @param file_disk_template: file-based disk template, whose directory 3019 is supposed to be verified 3020 @type verify_key: string 3021 @param verify_key: key for the verification map of this file 3022 verification step 3023 @param error_key: error key to be added to the verification results 3024 in case something goes wrong in this verification step 3025 3026 """ 3027 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes( 3028 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER 3029 )) 3030 3031 cluster = self.cfg.GetClusterInfo() 3032 if cluster.IsDiskTemplateEnabled(file_disk_template): 3033 self._ErrorIf( 3034 verify_key in nresult, 3035 error_key, ninfo.name, 3036 "The configured %s storage path is unusable: %s" % 3037 (file_disk_template, nresult.get(verify_key)))
3038
3039 - def _VerifyFileStoragePaths(self, ninfo, nresult):
3040 """Verifies (file) storage paths. 3041 3042 @see: C{_VerifyStoragePaths} 3043 3044 """ 3045 self._VerifyStoragePaths( 3046 ninfo, nresult, constants.DT_FILE, 3047 constants.NV_FILE_STORAGE_PATH, 3048 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
3049
3050 - def _VerifySharedFileStoragePaths(self, ninfo, nresult):
3051 """Verifies (file) storage paths. 3052 3053 @see: C{_VerifyStoragePaths} 3054 3055 """ 3056 self._VerifyStoragePaths( 3057 ninfo, nresult, constants.DT_SHARED_FILE, 3058 constants.NV_SHARED_FILE_STORAGE_PATH, 3059 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
3060
3061 - def _VerifyGlusterStoragePaths(self, ninfo, nresult):
3062 """Verifies (file) storage paths. 3063 3064 @see: C{_VerifyStoragePaths} 3065 3066 """ 3067 self._VerifyStoragePaths( 3068 ninfo, nresult, constants.DT_GLUSTER, 3069 constants.NV_GLUSTER_STORAGE_PATH, 3070 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
3071
3072 - def _VerifyOob(self, ninfo, nresult):
3073 """Verifies out of band functionality of a node. 3074 3075 @type ninfo: L{objects.Node} 3076 @param ninfo: the node to check 3077 @param nresult: the remote results for the node 3078 3079 """ 3080 # We just have to verify the paths on master and/or master candidates 3081 # as the oob helper is invoked on the master 3082 if ((ninfo.master_candidate or ninfo.master_capable) and 3083 constants.NV_OOB_PATHS in nresult): 3084 for path_result in nresult[constants.NV_OOB_PATHS]: 3085 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, 3086 ninfo.name, path_result)
3087
3088 - def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
3089 """Verifies and updates the node volume data. 3090 3091 This function will update a L{NodeImage}'s internal structures 3092 with data from the remote call. 3093 3094 @type ninfo: L{objects.Node} 3095 @param ninfo: the node to check 3096 @param nresult: the remote results for the node 3097 @param nimg: the node image object 3098 @param vg_name: the configured VG name 3099 3100 """ 3101 nimg.lvm_fail = True 3102 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data") 3103 if vg_name is None: 3104 pass 3105 elif isinstance(lvdata, basestring): 3106 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 3107 "LVM problem on node: %s", utils.SafeEncode(lvdata)) 3108 elif not isinstance(lvdata, dict): 3109 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 3110 "rpc call to node failed (lvlist)") 3111 else: 3112 nimg.volumes = lvdata 3113 nimg.lvm_fail = False
3114
3115 - def _UpdateNodeInstances(self, ninfo, nresult, nimg):
3116 """Verifies and updates the node instance list. 3117 3118 If the listing was successful, then updates this node's instance 3119 list. Otherwise, it marks the RPC call as failed for the instance 3120 list key. 3121 3122 @type ninfo: L{objects.Node} 3123 @param ninfo: the node to check 3124 @param nresult: the remote results for the node 3125 @param nimg: the node image object 3126 3127 """ 3128 idata = nresult.get(constants.NV_INSTANCELIST, None) 3129 test = not isinstance(idata, list) 3130 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 3131 "rpc call to node failed (instancelist): %s", 3132 utils.SafeEncode(str(idata))) 3133 if test: 3134 nimg.hyp_fail = True 3135 else: 3136 nimg.instances = [uuid for (uuid, _) in 3137 self.cfg.GetMultiInstanceInfoByName(idata)]
3138
3139 - def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
3140 """Verifies and computes a node information map 3141 3142 @type ninfo: L{objects.Node} 3143 @param ninfo: the node to check 3144 @param nresult: the remote results for the node 3145 @param nimg: the node image object 3146 @param vg_name: the configured VG name 3147 3148 """ 3149 # try to read free memory (from the hypervisor) 3150 hv_info = nresult.get(constants.NV_HVINFO, None) 3151 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info 3152 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 3153 "rpc call to node failed (hvinfo)") 3154 if not test: 3155 try: 3156 nimg.mfree = int(hv_info["memory_free"]) 3157 except (ValueError, TypeError): 3158 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 3159 "node returned invalid nodeinfo, check hypervisor") 3160 3161 # FIXME: devise a free space model for file based instances as well 3162 if vg_name is not None: 3163 test = (constants.NV_VGLIST not in nresult or 3164 vg_name not in nresult[constants.NV_VGLIST]) 3165 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 3166 "node didn't return data for the volume group '%s'" 3167 " - it is either missing or broken", vg_name) 3168 if not test: 3169 try: 3170 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name]) 3171 except (ValueError, TypeError): 3172 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 3173 "node returned invalid LVM info, check LVM status")
3174
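A hedged sketch of the extraction logic above, assuming only the simplified payload shapes shown here (a dict with "memory_free" for the hypervisor info, and a VG-name-to-free-space dict for the VG list):

# Illustrative only: returns (mfree, dfree, problems) from simplified payloads.
def extract_node_info(hv_info, vglist, vg_name):
  problems = []
  mfree = dfree = None
  if isinstance(hv_info, dict) and "memory_free" in hv_info:
    try:
      mfree = int(hv_info["memory_free"])
    except (ValueError, TypeError):
      problems.append("invalid nodeinfo from hypervisor")
  else:
    problems.append("hvinfo RPC failed")
  if vg_name is not None:
    if not isinstance(vglist, dict) or vg_name not in vglist:
      problems.append("missing or broken volume group %s" % vg_name)
    else:
      try:
        dfree = int(vglist[vg_name])
      except (ValueError, TypeError):
        problems.append("invalid LVM info")
  return (mfree, dfree, problems)

print(extract_node_info({"memory_free": "2048"}, {"xenvg": 51200}, "xenvg"))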
3175 - def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
3176 """Gets per-disk status information for all instances. 3177 3178 @type node_uuids: list of strings 3179 @param node_uuids: Node UUIDs 3180 @type node_image: dict of (UUID, L{objects.Node}) 3181 @param node_image: Node objects 3182 @type instanceinfo: dict of (UUID, L{objects.Instance}) 3183 @param instanceinfo: Instance objects 3184 @rtype: {instance: {node: [(succes, payload)]}} 3185 @return: a dictionary of per-instance dictionaries with nodes as 3186 keys and disk information as values; the disk information is a 3187 list of tuples (success, payload) 3188 3189 """ 3190 node_disks = {} 3191 node_disks_dev_inst_only = {} 3192 diskless_instances = set() 3193 nodisk_instances = set() 3194 diskless = constants.DT_DISKLESS 3195 3196 for nuuid in node_uuids: 3197 node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst, 3198 node_image[nuuid].sinst)) 3199 diskless_instances.update(uuid for uuid in node_inst_uuids 3200 if instanceinfo[uuid].disk_template == diskless) 3201 disks = [(inst_uuid, disk) 3202 for inst_uuid in node_inst_uuids 3203 for disk in self.cfg.GetInstanceDisks(inst_uuid)] 3204 3205 if not disks: 3206 nodisk_instances.update(uuid for uuid in node_inst_uuids 3207 if instanceinfo[uuid].disk_template != diskless) 3208 # No need to collect data 3209 continue 3210 3211 node_disks[nuuid] = disks 3212 3213 # _AnnotateDiskParams makes already copies of the disks 3214 dev_inst_only = [] 3215 for (inst_uuid, dev) in disks: 3216 (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev], 3217 self.cfg) 3218 dev_inst_only.append((anno_disk, instanceinfo[inst_uuid])) 3219 3220 node_disks_dev_inst_only[nuuid] = dev_inst_only 3221 3222 assert len(node_disks) == len(node_disks_dev_inst_only) 3223 3224 # Collect data from all nodes with disks 3225 result = self.rpc.call_blockdev_getmirrorstatus_multi( 3226 node_disks.keys(), node_disks_dev_inst_only) 3227 3228 assert len(result) == len(node_disks) 3229 3230 instdisk = {} 3231 3232 for (nuuid, nres) in result.items(): 3233 node = self.cfg.GetNodeInfo(nuuid) 3234 disks = node_disks[node.uuid] 3235 3236 if nres.offline: 3237 # No data from this node 3238 data = len(disks) * [(False, "node offline")] 3239 else: 3240 msg = nres.fail_msg 3241 self._ErrorIf(msg, constants.CV_ENODERPC, node.name, 3242 "while getting disk information: %s", msg) 3243 if msg: 3244 # No data from this node 3245 data = len(disks) * [(False, msg)] 3246 else: 3247 data = [] 3248 for idx, i in enumerate(nres.payload): 3249 if isinstance(i, (tuple, list)) and len(i) == 2: 3250 data.append(i) 3251 else: 3252 logging.warning("Invalid result from node %s, entry %d: %s", 3253 node.name, idx, i) 3254 data.append((False, "Invalid result from the remote node")) 3255 3256 for ((inst_uuid, _), status) in zip(disks, data): 3257 instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \ 3258 .append(status) 3259 3260 # Add empty entries for diskless instances. 
3261 for inst_uuid in diskless_instances: 3262 assert inst_uuid not in instdisk 3263 instdisk[inst_uuid] = {} 3264 # ...and disk-full instances that happen to have no disks 3265 for inst_uuid in nodisk_instances: 3266 assert inst_uuid not in instdisk 3267 instdisk[inst_uuid] = {} 3268 3269 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and 3270 len(nuuids) <= len( 3271 self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and 3272 compat.all(isinstance(s, (tuple, list)) and 3273 len(s) == 2 for s in statuses) 3274 for inst, nuuids in instdisk.items() 3275 for nuuid, statuses in nuuids.items()) 3276 if __debug__: 3277 instdisk_keys = set(instdisk) 3278 instanceinfo_keys = set(instanceinfo) 3279 assert instdisk_keys == instanceinfo_keys, \ 3280 ("instdisk keys (%s) do not match instanceinfo keys (%s)" % 3281 (instdisk_keys, instanceinfo_keys)) 3282 3283 return instdisk
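For orientation, the returned mapping might look as follows for a hypothetical two-disk instance plus a diskless one (UUIDs and payloads are made up; the real payload is whatever call_blockdev_getmirrorstatus_multi returns per disk):

# Hypothetical result of _CollectDiskInfo: instance UUID -> node UUID ->
# list of (success, payload) tuples, one entry per disk on that node.
instdisk = {
  "inst-uuid-1": {
    "node-uuid-a": [(True, "<disk status>"), (True, "<disk status>")],
    "node-uuid-b": [(False, "node offline"), (False, "node offline")],
  },
  "inst-uuid-2": {},   # diskless instances get an empty mapping
}
for inst, per_node in sorted(instdisk.items()):
  for node, statuses in sorted(per_node.items()):
    print("%s on %s: %s" % (inst, node, statuses))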
3284 3285 @staticmethod
3286 - def _SshNodeSelector(group_uuid, all_nodes):
3287 """Create endless iterators for all potential SSH check hosts. 3288 3289 """ 3290 nodes = [node for node in all_nodes 3291 if (node.group != group_uuid and 3292 not node.offline)] 3293 keyfunc = operator.attrgetter("group") 3294 3295 return map(itertools.cycle, 3296 [sorted(map(operator.attrgetter("name"), names)) 3297 for _, names in itertools.groupby(sorted(nodes, key=keyfunc), 3298 keyfunc)])
3299 3300 @classmethod
3301 - def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
3302 """Choose which nodes should talk to which other nodes. 3303 3304 We will make nodes contact all nodes in their group, and one node from 3305 every other group. 3306 3307 @warning: This algorithm has a known issue if one node group is much 3308 smaller than others (e.g. just one node). In such a case all other 3309 nodes will talk to the single node. 3310 3311 """ 3312 online_nodes = sorted(node.name for node in group_nodes if not node.offline) 3313 sel = cls._SshNodeSelector(group_uuid, all_nodes) 3314 3315 return (online_nodes, 3316 dict((name, sorted([i.next() for i in sel])) 3317 for name in online_nodes))
3318
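The two methods above are dense because of the cycling iterators; this toy re-implementation with plain tuples instead of node objects (all names hypothetical) shows the intended result: each online node of the group gets one round-robin pick from every other group:

import itertools
import operator

# (name, group, offline) tuples stand in for objects.Node; names are made up.
all_nodes = [("n1", "g1", False), ("n2", "g1", False),
             ("o1", "g2", False), ("o2", "g2", False),
             ("p1", "g3", False)]
group_uuid = "g1"
group_nodes = [n for n in all_nodes if n[1] == group_uuid]

# One endless iterator per *other* group, cycling over its node names.
others = [n for n in all_nodes if n[1] != group_uuid and not n[2]]
keyfunc = operator.itemgetter(1)
selectors = [itertools.cycle(sorted(n[0] for n in names))
             for _, names in itertools.groupby(sorted(others, key=keyfunc),
                                               keyfunc)]

online = sorted(n[0] for n in group_nodes if not n[2])
mapping = dict((name, sorted(next(i) for i in selectors)) for name in online)
print(mapping)   # e.g. {'n1': ['o1', 'p1'], 'n2': ['o2', 'p1']}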
3319 - def BuildHooksEnv(self):
3320    """Build hooks env.
3321 
3322    Cluster-Verify hooks are run only in the post phase; a failing hook has
3323    its output logged in the verify output and makes the verification fail.
3324 
3325    """
3326    env = {
3327      "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
3328      }
3329 
3330    env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
3331               for node in self.my_node_info.values())
3332 
3333    return env
3334
3335 - def BuildHooksNodes(self):
3336 """Build hooks nodes. 3337 3338 """ 3339 return ([], list(self.my_node_info.keys()))
3340
3341 - def Exec(self, feedback_fn): # pylint: disable=R0915
3342 """Verify integrity of the node group, performing various test on nodes. 3343 3344 """ 3345 # This method has too many local variables. pylint: disable=R0914 3346 feedback_fn("* Verifying group '%s'" % self.group_info.name) 3347 3348 if not self.my_node_uuids: 3349 # empty node group 3350 feedback_fn("* Empty node group, skipping verification") 3351 return True 3352 3353 self.bad = False 3354 verbose = self.op.verbose 3355 self._feedback_fn = feedback_fn 3356 3357 vg_name = self.cfg.GetVGName() 3358 drbd_helper = self.cfg.GetDRBDHelper() 3359 cluster = self.cfg.GetClusterInfo() 3360 hypervisors = cluster.enabled_hypervisors 3361 node_data_list = self.my_node_info.values() 3362 3363 i_non_redundant = [] # Non redundant instances 3364 i_non_a_balanced = [] # Non auto-balanced instances 3365 i_offline = 0 # Count of offline instances 3366 n_offline = 0 # Count of offline nodes 3367 n_drained = 0 # Count of nodes being drained 3368 node_vol_should = {} 3369 3370 # FIXME: verify OS list 3371 3372 # File verification 3373 filemap = ComputeAncillaryFiles(cluster, False) 3374 3375 # do local checksums 3376 master_node_uuid = self.master_node = self.cfg.GetMasterNode() 3377 master_ip = self.cfg.GetMasterIP() 3378 3379 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids)) 3380 3381 user_scripts = [] 3382 if self.cfg.GetUseExternalMipScript(): 3383 user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT) 3384 3385 node_verify_param = { 3386 constants.NV_FILELIST: 3387 map(vcluster.MakeVirtualPath, 3388 utils.UniqueSequence(filename 3389 for files in filemap 3390 for filename in files)), 3391 constants.NV_NODELIST: 3392 self._SelectSshCheckNodes(node_data_list, self.group_uuid, 3393 self.all_node_info.values()), 3394 constants.NV_HYPERVISOR: hypervisors, 3395 constants.NV_HVPARAMS: 3396 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()), 3397 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip) 3398 for node in node_data_list 3399 if not node.offline], 3400 constants.NV_INSTANCELIST: hypervisors, 3401 constants.NV_VERSION: None, 3402 constants.NV_HVINFO: self.cfg.GetHypervisorType(), 3403 constants.NV_NODESETUP: None, 3404 constants.NV_TIME: None, 3405 constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip), 3406 constants.NV_OSLIST: None, 3407 constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(), 3408 constants.NV_USERSCRIPTS: user_scripts, 3409 constants.NV_CLIENT_CERT: None, 3410 } 3411 3412 if vg_name is not None: 3413 node_verify_param[constants.NV_VGLIST] = None 3414 node_verify_param[constants.NV_LVLIST] = vg_name 3415 node_verify_param[constants.NV_PVLIST] = [vg_name] 3416 3417 if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8): 3418 if drbd_helper: 3419 node_verify_param[constants.NV_DRBDVERSION] = None 3420 node_verify_param[constants.NV_DRBDLIST] = None 3421 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper 3422 3423 if cluster.IsFileStorageEnabled() or \ 3424 cluster.IsSharedFileStorageEnabled(): 3425 # Load file storage paths only from master node 3426 node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \ 3427 self.cfg.GetMasterNodeName() 3428 if cluster.IsFileStorageEnabled(): 3429 node_verify_param[constants.NV_FILE_STORAGE_PATH] = \ 3430 cluster.file_storage_dir 3431 if cluster.IsSharedFileStorageEnabled(): 3432 node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \ 3433 cluster.shared_file_storage_dir 3434 3435 # bridge checks 3436 # FIXME: this needs to be changed per node-group, not 
cluster-wide 3437 bridges = set() 3438 default_nicpp = cluster.nicparams[constants.PP_DEFAULT] 3439 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: 3440 bridges.add(default_nicpp[constants.NIC_LINK]) 3441 for inst_uuid in self.my_inst_info.values(): 3442 for nic in inst_uuid.nics: 3443 full_nic = cluster.SimpleFillNIC(nic.nicparams) 3444 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: 3445 bridges.add(full_nic[constants.NIC_LINK]) 3446 3447 if bridges: 3448 node_verify_param[constants.NV_BRIDGES] = list(bridges) 3449 3450 # Build our expected cluster state 3451 node_image = dict((node.uuid, self.NodeImage(offline=node.offline, 3452 uuid=node.uuid, 3453 vm_capable=node.vm_capable)) 3454 for node in node_data_list) 3455 3456 # Gather OOB paths 3457 oob_paths = [] 3458 for node in self.all_node_info.values(): 3459 path = SupportsOob(self.cfg, node) 3460 if path and path not in oob_paths: 3461 oob_paths.append(path) 3462 3463 if oob_paths: 3464 node_verify_param[constants.NV_OOB_PATHS] = oob_paths 3465 3466 for inst_uuid in self.my_inst_uuids: 3467 instance = self.my_inst_info[inst_uuid] 3468 if instance.admin_state == constants.ADMINST_OFFLINE: 3469 i_offline += 1 3470 3471 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid) 3472 for nuuid in inst_nodes: 3473 if nuuid not in node_image: 3474 gnode = self.NodeImage(uuid=nuuid) 3475 gnode.ghost = (nuuid not in self.all_node_info) 3476 node_image[nuuid] = gnode 3477 3478 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 3479 3480 pnode = instance.primary_node 3481 node_image[pnode].pinst.append(instance.uuid) 3482 3483 for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid): 3484 nimg = node_image[snode] 3485 nimg.sinst.append(instance.uuid) 3486 if pnode not in nimg.sbp: 3487 nimg.sbp[pnode] = [] 3488 nimg.sbp[pnode].append(instance.uuid) 3489 3490 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, 3491 self.my_node_info.keys()) 3492 # The value of exclusive_storage should be the same across the group, so if 3493 # it's True for at least a node, we act as if it were set for all the nodes 3494 self._exclusive_storage = compat.any(es_flags.values()) 3495 if self._exclusive_storage: 3496 node_verify_param[constants.NV_EXCLUSIVEPVS] = True 3497 3498 node_group_uuids = dict(map(lambda n: (n.name, n.group), 3499 self.cfg.GetAllNodesInfo().values())) 3500 groups_config = self.cfg.GetAllNodeGroupsInfoDict() 3501 3502 # At this point, we have the in-memory data structures complete, 3503 # except for the runtime information, which we'll gather next 3504 3505 # NOTE: Here we lock the configuration for the duration of RPC calls, 3506 # which means that the cluster configuration changes are blocked during 3507 # this period. 3508 # This is something that should be done only exceptionally and only for 3509 # justified cases! 3510 # In this case, we need the lock as we can only verify the integrity of 3511 # configuration files on MCs only if we know nobody else is modifying it. 3512 # FIXME: The check for integrity of config.data should be moved to 3513 # WConfD, which is the only one who can otherwise ensure nobody 3514 # will modify the configuration during the check. 
3515 with self.cfg.GetConfigManager(shared=True): 3516 feedback_fn("* Gathering information about nodes (%s nodes)" % 3517 len(self.my_node_uuids)) 3518 # Force the configuration to be fully distributed before doing any tests 3519 self.cfg.FlushConfig() 3520 # Due to the way our RPC system works, exact response times cannot be 3521 # guaranteed (e.g. a broken node could run into a timeout). By keeping 3522 # the time before and after executing the request, we can at least have 3523 # a time window. 3524 nvinfo_starttime = time.time() 3525 # Get lock on the configuration so that nobody modifies it concurrently. 3526 # Otherwise it can be modified by other jobs, failing the consistency 3527 # test. 3528 # NOTE: This is an exceptional situation, we should otherwise avoid 3529 # locking the configuration for something but very fast, pure operations. 3530 cluster_name = self.cfg.GetClusterName() 3531 hvparams = self.cfg.GetClusterInfo().hvparams 3532 all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids, 3533 node_verify_param, 3534 cluster_name, 3535 hvparams, 3536 node_group_uuids, 3537 groups_config) 3538 nvinfo_endtime = time.time() 3539 3540 if self.extra_lv_nodes and vg_name is not None: 3541 feedback_fn("* Gathering information about extra nodes (%s nodes)" % 3542 len(self.extra_lv_nodes)) 3543 extra_lv_nvinfo = \ 3544 self.rpc.call_node_verify(self.extra_lv_nodes, 3545 {constants.NV_LVLIST: vg_name}, 3546 self.cfg.GetClusterName(), 3547 self.cfg.GetClusterInfo().hvparams, 3548 node_group_uuids, 3549 groups_config) 3550 else: 3551 extra_lv_nvinfo = {} 3552 3553 # If not all nodes are being checked, we need to make sure the master 3554 # node and a non-checked vm_capable node are in the list. 3555 absent_node_uuids = set(self.all_node_info).difference(self.my_node_info) 3556 if absent_node_uuids: 3557 vf_nvinfo = all_nvinfo.copy() 3558 vf_node_info = list(self.my_node_info.values()) 3559 additional_node_uuids = [] 3560 if master_node_uuid not in self.my_node_info: 3561 additional_node_uuids.append(master_node_uuid) 3562 vf_node_info.append(self.all_node_info[master_node_uuid]) 3563 # Add the first vm_capable node we find which is not included, 3564 # excluding the master node (which we already have) 3565 for node_uuid in absent_node_uuids: 3566 nodeinfo = self.all_node_info[node_uuid] 3567 if (nodeinfo.vm_capable and not nodeinfo.offline and 3568 node_uuid != master_node_uuid): 3569 additional_node_uuids.append(node_uuid) 3570 vf_node_info.append(self.all_node_info[node_uuid]) 3571 break 3572 key = constants.NV_FILELIST 3573 3574 feedback_fn("* Gathering information about the master node") 3575 vf_nvinfo.update(self.rpc.call_node_verify( 3576 additional_node_uuids, {key: node_verify_param[key]}, 3577 self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams, 3578 node_group_uuids, 3579 groups_config)) 3580 else: 3581 vf_nvinfo = all_nvinfo 3582 vf_node_info = self.my_node_info.values() 3583 3584 all_drbd_map = self.cfg.ComputeDRBDMap() 3585 3586 feedback_fn("* Gathering disk information (%s nodes)" % 3587 len(self.my_node_uuids)) 3588 instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image, 3589 self.my_inst_info) 3590 3591 feedback_fn("* Verifying configuration file consistency") 3592 3593 self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo) 3594 3595 self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap) 3596 3597 feedback_fn("* Verifying node status") 3598 3599 refos_img = None 3600 3601 for node_i in node_data_list: 3602 nimg = 
node_image[node_i.uuid] 3603 3604 if node_i.offline: 3605 if verbose: 3606 feedback_fn("* Skipping offline node %s" % (node_i.name,)) 3607 n_offline += 1 3608 continue 3609 3610 if node_i.uuid == master_node_uuid: 3611 ntype = "master" 3612 elif node_i.master_candidate: 3613 ntype = "master candidate" 3614 elif node_i.drained: 3615 ntype = "drained" 3616 n_drained += 1 3617 else: 3618 ntype = "regular" 3619 if verbose: 3620 feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype)) 3621 3622 msg = all_nvinfo[node_i.uuid].fail_msg 3623 self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name, 3624 "while contacting node: %s", msg) 3625 if msg: 3626 nimg.rpc_fail = True 3627 continue 3628 3629 nresult = all_nvinfo[node_i.uuid].payload 3630 3631 nimg.call_ok = self._VerifyNode(node_i, nresult) 3632 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime) 3633 self._VerifyNodeNetwork(node_i, nresult) 3634 self._VerifyNodeUserScripts(node_i, nresult) 3635 self._VerifyOob(node_i, nresult) 3636 self._VerifyAcceptedFileStoragePaths(node_i, nresult, 3637 node_i.uuid == master_node_uuid) 3638 self._VerifyFileStoragePaths(node_i, nresult) 3639 self._VerifySharedFileStoragePaths(node_i, nresult) 3640 self._VerifyGlusterStoragePaths(node_i, nresult) 3641 3642 if nimg.vm_capable: 3643 self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg) 3644 if constants.DT_DRBD8 in cluster.enabled_disk_templates: 3645 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper, 3646 all_drbd_map) 3647 3648 if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \ 3649 (constants.DT_DRBD8 in cluster.enabled_disk_templates): 3650 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name) 3651 self._UpdateNodeInstances(node_i, nresult, nimg) 3652 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name) 3653 self._UpdateNodeOS(node_i, nresult, nimg) 3654 3655 if not nimg.os_fail: 3656 if refos_img is None: 3657 refos_img = nimg 3658 self._VerifyNodeOS(node_i, nimg, refos_img) 3659 self._VerifyNodeBridges(node_i, nresult, bridges) 3660 3661 # Check whether all running instances are primary for the node. (This 3662 # can no longer be done from _VerifyInstance below, since some of the 3663 # wrong instances could be from other node groups.) 3664 non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst) 3665 3666 for inst_uuid in non_primary_inst_uuids: 3667 test = inst_uuid in self.all_inst_info 3668 self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, 3669 self.cfg.GetInstanceName(inst_uuid), 3670 "instance should not run on node %s", node_i.name) 3671 self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name, 3672 "node is running unknown instance %s", inst_uuid) 3673 3674 self._VerifyGroupDRBDVersion(all_nvinfo) 3675 self._VerifyGroupLVM(node_image, vg_name) 3676 3677 for node_uuid, result in extra_lv_nvinfo.items(): 3678 self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload, 3679 node_image[node_uuid], vg_name) 3680 3681 feedback_fn("* Verifying instance status") 3682 for inst_uuid in self.my_inst_uuids: 3683 instance = self.my_inst_info[inst_uuid] 3684 if verbose: 3685 feedback_fn("* Verifying instance %s" % instance.name) 3686 self._VerifyInstance(instance, node_image, instdisk[inst_uuid]) 3687 3688 # If the instance is non-redundant we cannot survive losing its primary 3689 # node, so we are not N+1 compliant. 
3690 if instance.disk_template not in constants.DTS_MIRRORED: 3691 i_non_redundant.append(instance) 3692 3693 if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]: 3694 i_non_a_balanced.append(instance) 3695 3696 feedback_fn("* Verifying orphan volumes") 3697 reserved = utils.FieldSet(*cluster.reserved_lvs) 3698 3699 # We will get spurious "unknown volume" warnings if any node of this group 3700 # is secondary for an instance whose primary is in another group. To avoid 3701 # them, we find these instances and add their volumes to node_vol_should. 3702 for instance in self.all_inst_info.values(): 3703 for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid): 3704 if (secondary in self.my_node_info 3705 and instance.name not in self.my_inst_info): 3706 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 3707 break 3708 3709 self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved) 3710 3711 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks: 3712 feedback_fn("* Verifying N+1 Memory redundancy") 3713 self._VerifyNPlusOneMemory(node_image, self.my_inst_info) 3714 3715 feedback_fn("* Other Notes") 3716 if i_non_redundant: 3717 feedback_fn(" - NOTICE: %d non-redundant instance(s) found." 3718 % len(i_non_redundant)) 3719 3720 if i_non_a_balanced: 3721 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found." 3722 % len(i_non_a_balanced)) 3723 3724 if i_offline: 3725 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline) 3726 3727 if n_offline: 3728 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline) 3729 3730 if n_drained: 3731 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained) 3732 3733 return not self.bad
3734
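Exec is long, but its core pattern is short: build one node_verify_param dictionary (check name to check parameters), fan it out with a single call_node_verify RPC, then walk the per-node results, first checking fail_msg and only then inspecting the payload. A schematic, self-contained sketch of that pattern; check names and result fields are simplified stand-ins, not the real constants or RPC result objects:

# Schematic only: build params, fan out, inspect per-node results.
def run_group_verify(node_names, rpc_call, feedback_fn):
  node_verify_param = {
    "filelist": ["/var/lib/ganeti/config.data"],   # files to checksum
    "version": None,                               # parameter-less checks
    "time": None,
  }
  results = rpc_call(node_names, node_verify_param)   # one RPC, all nodes
  bad = False
  for name in node_names:
    res = results[name]
    if res.get("fail_msg"):
      feedback_fn("  - ERROR: while contacting node %s: %s"
                  % (name, res["fail_msg"]))
      bad = True
      continue
    # res["payload"] holds one entry per requested check; each per-check
    # verifier above inspects its own slice of it.
    feedback_fn("* node %s returned %d check results"
                % (name, len(res["payload"])))
  return not bad

def fake_rpc(nodes, params):
  return dict((n, {"fail_msg": None, "payload": dict.fromkeys(params)})
              for n in nodes)

def feedback(msg):
  print(msg)

print(run_group_verify(["node1", "node2"], fake_rpc, feedback))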
3735 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3736    """Analyze the post-hooks' result.
3737 
3738    This method analyzes the hook result, handles it, and sends some
3739    nicely-formatted feedback back to the user.
3740 
3741    @param phase: one of L{constants.HOOKS_PHASE_POST} or
3742      L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3743    @param hooks_results: the results of the multi-node hooks rpc call
3744    @param feedback_fn: function used to send feedback back to the caller
3745    @param lu_result: previous Exec result
3746    @return: the new Exec result, based on the previous result
3747      and hook results
3748 
3749    """
3750    # We only really run POST phase hooks, only for non-empty groups,
3751    # and are only interested in their results
3752    if not self.my_node_uuids:
3753      # empty node group
3754      pass
3755    elif phase == constants.HOOKS_PHASE_POST:
3756      # Used to change hooks' output to proper indentation
3757      feedback_fn("* Hooks Results")
3758      assert hooks_results, "invalid result from hooks"
3759 
3760      for node_name in hooks_results:
3761        res = hooks_results[node_name]
3762        msg = res.fail_msg
3763        test = msg and not res.offline
3764        self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3765                      "Communication failure in hooks execution: %s", msg)
3766        if test:
3767          lu_result = False
3768          continue
3769        if res.offline:
3770          # No need to investigate payload if node is offline
3771          continue
3772        for script, hkr, output in res.payload:
3773          test = hkr == constants.HKR_FAIL
3774          self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3775                        "Script %s failed, output:", script)
3776          if test:
3777            output = self._HOOKS_INDENT_RE.sub(" ", output)
3778            feedback_fn("%s" % output)
3779            lu_result = False
3780 
3781    return lu_result
3782
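Each entry of res.payload above is a (script, status, output) triple. A hedged sketch of the failure path, with the constant and the indentation regexp replaced by simple stand-ins:

import re

HKR_FAIL = 2                              # stand-in for constants.HKR_FAIL
HOOKS_INDENT_RE = re.compile("^", re.M)   # stand-in for _HOOKS_INDENT_RE

def summarize_hooks(payload, feedback_fn):
  ok = True
  for script, status, output in payload:
    if status == HKR_FAIL:
      feedback_fn("Script %s failed, output:" % script)
      feedback_fn(HOOKS_INDENT_RE.sub("      ", output))
      ok = False
  return ok

def feedback(msg):
  print(msg)

print(summarize_hooks([("10-check-foo", HKR_FAIL, "foo is missing")],
                      feedback))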
3783 3784 -class LUClusterVerifyDisks(NoHooksLU):
3785   """Verifies the status of the cluster's disks.
3786 
3787   """
3788   REQ_BGL = False
3789 
3790 - def ExpandNames(self):
3791 self.share_locks = ShareAll() 3792 self.needed_locks = { 3793 locking.LEVEL_NODEGROUP: locking.ALL_SET, 3794 }
3795
3796 - def Exec(self, feedback_fn):
3797 group_names = self.owned_locks(locking.LEVEL_NODEGROUP) 3798 3799 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group 3800 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)] 3801 for group in group_names])
3802
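The Exec above verifies nothing itself; it merely expands into one OpGroupVerifyDisks job per locked node group, which the job queue then runs. A sketch of the resulting job list for a hypothetical two-group cluster, with plain tuples standing in for the opcode objects handed to ResultWithJobs:

# Illustrative only: one single-opcode job per node group.
group_names = ["default", "storage"]
jobs = [[("OpGroupVerifyDisks", {"group_name": group})]
        for group in group_names]
for job in jobs:
  print(job)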