
Source Code for Module ganeti.cmdlib.cluster.verify

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30  """Logical units for cluster verification.""" 
  31   
  32  import itertools 
  33  import logging 
  34  import operator 
  35  import re 
  36  import time 
  37  import ganeti.masterd.instance 
  38  import ganeti.rpc.node as rpc 
  39   
  40  from ganeti import compat 
  41  from ganeti import constants 
  42  from ganeti import errors 
  43  from ganeti import locking 
  44  from ganeti import pathutils 
  45  from ganeti import utils 
  46  from ganeti import vcluster 
  47  from ganeti import hypervisor 
  48  from ganeti import opcodes 
  49   
  50  from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs 
  51  from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \ 
  52      CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \ 
  53      SupportsOob 
54 55 56 -def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters. 58 59 @type cluster: L{objects.Cluster} 60 @param cluster: the cluster object 61 @param instances: list of L{objects.Instance} 62 @param instances: additional instances from which to obtain parameters 63 @rtype: list of (origin, hypervisor, parameters) 64 @return: a list with all parameters found, indicating the hypervisor they 65 apply to, and the origin (can be "cluster", "os X", or "instance Y") 66 67 """ 68 hvp_data = [] 69 70 for hv_name in cluster.enabled_hypervisors: 71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) 72 73 for os_name, os_hvp in cluster.os_hvp.items(): 74 for hv_name, hv_params in os_hvp.items(): 75 if hv_params: 76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) 77 hvp_data.append(("os %s" % os_name, hv_name, full_params)) 78 79 # TODO: collapse identical parameter values in a single one 80 for instance in instances: 81 if instance.hvparams: 82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor, 83 cluster.FillHV(instance))) 84 85 return hvp_data
86
87 88 -class _VerifyErrors(object):
89 """Mix-in for cluster/group verify LUs. 90 91 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects 92 self.op and self._feedback_fn to be available.) 93 94 """ 95 96 ETYPE_ERROR = constants.CV_ERROR 97 ETYPE_WARNING = constants.CV_WARNING 98
99 - def _ErrorMsgList(self, error_descriptor, object_name, message_list, 100 log_type=ETYPE_ERROR):
101 """Format multiple error messages. 102 103 Based on the opcode's error_codes parameter, either format a 104 parseable error code, or a simpler error string. 105 106 This must be called only from Exec and functions called from Exec. 107 108 109 @type error_descriptor: tuple (string, string, string) 110 @param error_descriptor: triplet describing the error (object_type, 111 code, description) 112 @type object_name: string 113 @param object_name: name of object (instance, node ..) the error relates to 114 @type message_list: list of strings 115 @param message_list: body of error messages 116 @type log_type: string 117 @param log_type: log message type (WARNING, ERROR ..) 118 """ 119 # Called with empty list - nothing to do 120 if not message_list: 121 return 122 123 object_type, error_code, _ = error_descriptor 124 # If the error code is in the list of ignored errors, demote the error to a 125 # warning 126 if error_code in self.op.ignore_errors: # pylint: disable=E1101 127 log_type = self.ETYPE_WARNING 128 129 prefixed_list = [] 130 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101 131 for msg in message_list: 132 prefixed_list.append(" - %s:%s:%s:%s:%s" % ( 133 log_type, error_code, object_type, object_name, msg)) 134 else: 135 if not object_name: 136 object_name = "" 137 for msg in message_list: 138 prefixed_list.append(" - %s: %s %s: %s" % ( 139 log_type, object_type, object_name, msg)) 140 141 # Report messages via the feedback_fn 142 # pylint: disable=E1101 143 self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list) 144 # pylint: enable=E1101 145 146 # do not mark the operation as failed for WARN cases only 147 if log_type == self.ETYPE_ERROR: 148 self.bad = True
149
150 - def _ErrorMsg(self, error_descriptor, object_name, message, 151 log_type=ETYPE_ERROR):
152 """Log a single error message. 153 154 """ 155 self._ErrorMsgList(error_descriptor, object_name, [message], log_type)
156 157 # TODO: Replace this method with a cleaner interface, get rid of the if 158 # condition as it only rarely saves lines, but makes things less readable.
159 - def _ErrorIf(self, cond, *args, **kwargs):
160 """Log an error message if the passed condition is True. 161 162 """ 163 if (bool(cond) 164 or self.op.debug_simulate_errors): # pylint: disable=E1101 165 self._Error(*args, **kwargs)
166 167 # TODO: Replace this method with a cleaner interface
168 - def _Error(self, ecode, item, message, *args, **kwargs):
169 """Log an error message if the passed condition is True. 170 171 """ 172 #TODO: Remove 'code' argument in favour of using log_type 173 log_type = kwargs.get('code', self.ETYPE_ERROR) 174 if args: 175 message = message % args 176 self._ErrorMsgList(ecode, item, [message], log_type=log_type)
177
178 179 -class LUClusterVerify(NoHooksLU):
180 """Submits all jobs necessary to verify the cluster. 181 182 """ 183 REQ_BGL = False 184
185 - def ExpandNames(self):
186 self.needed_locks = {}
187
188 - def Exec(self, feedback_fn):
189 jobs = [] 190 191 if self.op.group_name: 192 groups = [self.op.group_name] 193 depends_fn = lambda: None 194 else: 195 groups = self.cfg.GetNodeGroupList() 196 197 # Verify global configuration 198 jobs.append([ 199 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors), 200 ]) 201 202 # Always depend on global verification 203 depends_fn = lambda: [(-len(jobs), [])] 204 205 jobs.extend( 206 [opcodes.OpClusterVerifyGroup(group_name=group, 207 ignore_errors=self.op.ignore_errors, 208 depends=depends_fn(), 209 verify_clutter=self.op.verify_clutter)] 210 for group in groups) 211 212 # Fix up all parameters 213 for op in itertools.chain(*jobs): # pylint: disable=W0142 214 op.debug_simulate_errors = self.op.debug_simulate_errors 215 op.verbose = self.op.verbose 216 op.error_codes = self.op.error_codes 217 try: 218 op.skip_checks = self.op.skip_checks 219 except AttributeError: 220 assert not isinstance(op, opcodes.OpClusterVerifyGroup) 221 222 return ResultWithJobs(jobs)
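A self-contained sketch of the relative job dependency used above; the opcode names are plain strings standing in for the real opcode classes:

jobs = []

# Job 0 verifies the global configuration.
jobs.append(["OpClusterVerifyConfig"])

# A relative dependency of -N means "the job submitted N positions earlier".
# Because len(jobs) grows as each group job is appended, -len(jobs) always
# points back at job 0, the config verification.
depends_fn = lambda: [(-len(jobs), [])]

for group in ["default", "rack2"]:
    jobs.append([("OpClusterVerifyGroup", group, depends_fn())])

for idx, job in enumerate(jobs):
    print("job %d: %s" % (idx, job))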
223
224 225 -class LUClusterVerifyDisks(NoHooksLU):
226 """Verifies the cluster disks status. 227 228 """ 229 REQ_BGL = False 230
231 - def ExpandNames(self):
232 self.share_locks = ShareAll() 233 self.needed_locks = { 234 locking.LEVEL_NODEGROUP: locking.ALL_SET, 235 }
236
237 - def Exec(self, feedback_fn):
238 group_names = self.owned_locks(locking.LEVEL_NODEGROUP) 239 240 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group 241 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)] 242 for group in group_names])
243
244 245 -class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
246 """Verifies the cluster config. 247 248 """ 249 REQ_BGL = False 250
251 - def _VerifyHVP(self, hvp_data):
252 """Verifies locally the syntax of the hypervisor parameters. 253 254 """ 255 for item, hv_name, hv_params in hvp_data: 256 msg = ("hypervisor %s parameters syntax check (source %s): %%s" % 257 (item, hv_name)) 258 try: 259 hv_class = hypervisor.GetHypervisorClass(hv_name) 260 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) 261 hv_class.CheckParameterSyntax(hv_params) 262 except errors.GenericError, err: 263 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
264
265 - def ExpandNames(self):
266 self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET) 267 self.share_locks = ShareAll()
268
269 - def CheckPrereq(self):
270 """Check prerequisites. 271 272 """ 273 # Retrieve all information 274 self.all_group_info = self.cfg.GetAllNodeGroupsInfo() 275 self.all_node_info = self.cfg.GetAllNodesInfo() 276 self.all_inst_info = self.cfg.GetAllInstancesInfo()
277
278 - def Exec(self, feedback_fn):
279 """Verify integrity of cluster, performing various test on nodes. 280 281 """ 282 self.bad = False 283 self._feedback_fn = feedback_fn 284 285 feedback_fn("* Verifying cluster config") 286 287 msg_list = self.cfg.VerifyConfig() 288 self._ErrorMsgList(constants.CV_ECLUSTERCFG, None, msg_list) 289 290 feedback_fn("* Verifying cluster certificate files") 291 292 for cert_filename in pathutils.ALL_CERT_FILES: 293 (errcode, msg) = utils.VerifyCertificate(cert_filename) 294 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode) 295 296 self._ErrorIf(not utils.CanRead(constants.LUXID_USER, 297 pathutils.NODED_CERT_FILE), 298 constants.CV_ECLUSTERCERT, 299 None, 300 pathutils.NODED_CERT_FILE + " must be accessible by the " + 301 constants.LUXID_USER + " user") 302 303 feedback_fn("* Verifying hypervisor parameters") 304 305 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(), 306 self.all_inst_info.values())) 307 308 feedback_fn("* Verifying all nodes belong to an existing group") 309 310 # We do this verification here because, should this bogus circumstance 311 # occur, it would never be caught by VerifyGroup, which only acts on 312 # nodes/instances reachable from existing node groups. 313 314 dangling_nodes = set(node for node in self.all_node_info.values() 315 if node.group not in self.all_group_info) 316 317 dangling_instances = {} 318 no_node_instances = [] 319 320 for inst in self.all_inst_info.values(): 321 if inst.primary_node in [node.uuid for node in dangling_nodes]: 322 dangling_instances.setdefault(inst.primary_node, []).append(inst) 323 elif inst.primary_node not in self.all_node_info: 324 no_node_instances.append(inst) 325 326 pretty_dangling = [ 327 "%s (%s)" % 328 (node.name, 329 utils.CommaJoin(inst.name for 330 inst in dangling_instances.get(node.uuid, []))) 331 for node in dangling_nodes] 332 333 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES, 334 None, 335 "the following nodes (and their instances) belong to a non" 336 " existing group: %s", utils.CommaJoin(pretty_dangling)) 337 338 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST, 339 None, 340 "the following instances have a non-existing primary-node:" 341 " %s", utils.CommaJoin(inst.name for 342 inst in no_node_instances)) 343 344 return not self.bad
345
346 347 -class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
348 """Verifies the status of a node group. 349 350 """ 351 HPATH = "cluster-verify" 352 HTYPE = constants.HTYPE_CLUSTER 353 REQ_BGL = False 354 355 _HOOKS_INDENT_RE = re.compile("^", re.M) 356
357 - class NodeImage(object):
358 """A class representing the logical and physical status of a node. 359 360 @type uuid: string 361 @ivar uuid: the node UUID to which this object refers 362 @ivar volumes: a structure as returned from 363 L{ganeti.backend.GetVolumeList} (runtime) 364 @ivar instances: a list of running instances (runtime) 365 @ivar pinst: list of configured primary instances (config) 366 @ivar sinst: list of configured secondary instances (config) 367 @ivar sbp: dictionary of {primary-node: list of instances} for all 368 instances for which this node is secondary (config) 369 @ivar mfree: free memory, as reported by hypervisor (runtime) 370 @ivar dfree: free disk, as reported by the node (runtime) 371 @ivar offline: the offline status (config) 372 @type rpc_fail: boolean 373 @ivar rpc_fail: whether the RPC verify call was successfull (overall, 374 not whether the individual keys were correct) (runtime) 375 @type lvm_fail: boolean 376 @ivar lvm_fail: whether the RPC call didn't return valid LVM data 377 @type hyp_fail: boolean 378 @ivar hyp_fail: whether the RPC call didn't return the instance list 379 @type ghost: boolean 380 @ivar ghost: whether this is a known node or not (config) 381 @type os_fail: boolean 382 @ivar os_fail: whether the RPC call didn't return valid OS data 383 @type oslist: list 384 @ivar oslist: list of OSes as diagnosed by DiagnoseOS 385 @type vm_capable: boolean 386 @ivar vm_capable: whether the node can host instances 387 @type pv_min: float 388 @ivar pv_min: size in MiB of the smallest PVs 389 @type pv_max: float 390 @ivar pv_max: size in MiB of the biggest PVs 391 392 """
393 - def __init__(self, offline=False, uuid=None, vm_capable=True):
394 self.uuid = uuid 395 self.volumes = {} 396 self.instances = [] 397 self.pinst = [] 398 self.sinst = [] 399 self.sbp = {} 400 self.mfree = 0 401 self.dfree = 0 402 self.offline = offline 403 self.vm_capable = vm_capable 404 self.rpc_fail = False 405 self.lvm_fail = False 406 self.hyp_fail = False 407 self.ghost = False 408 self.os_fail = False 409 self.oslist = {} 410 self.pv_min = None 411 self.pv_max = None
412
413 - def ExpandNames(self):
414 # This raises errors.OpPrereqError on its own: 415 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name) 416 417 # Get instances in node group; this is unsafe and needs verification later 418 inst_uuids = \ 419 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 420 421 self.needed_locks = { 422 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids), 423 locking.LEVEL_NODEGROUP: [self.group_uuid], 424 locking.LEVEL_NODE: [], 425 } 426 427 self.share_locks = ShareAll()
428
429 - def DeclareLocks(self, level):
430 if level == locking.LEVEL_NODE: 431 # Get members of node group; this is unsafe and needs verification later 432 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members) 433 434 # In Exec(), we warn about mirrored instances that have primary and 435 # secondary living in separate node groups. To fully verify that 436 # volumes for these instances are healthy, we will need to do an 437 # extra call to their secondaries. We ensure here those nodes will 438 # be locked. 439 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE): 440 # Important: access only the instances whose lock is owned 441 instance = self.cfg.GetInstanceInfoByName(inst_name) 442 disks = self.cfg.GetInstanceDisks(instance.uuid) 443 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 444 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid)) 445 446 self.needed_locks[locking.LEVEL_NODE] = nodes
447
448 - def CheckPrereq(self):
449 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP) 450 self.group_info = self.cfg.GetNodeGroup(self.group_uuid) 451 452 group_node_uuids = set(self.group_info.members) 453 group_inst_uuids = \ 454 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 455 456 unlocked_node_uuids = \ 457 group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE)) 458 459 unlocked_inst_uuids = \ 460 group_inst_uuids.difference( 461 [self.cfg.GetInstanceInfoByName(name).uuid 462 for name in self.owned_locks(locking.LEVEL_INSTANCE)]) 463 464 if unlocked_node_uuids: 465 raise errors.OpPrereqError( 466 "Missing lock for nodes: %s" % 467 utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)), 468 errors.ECODE_STATE) 469 470 if unlocked_inst_uuids: 471 raise errors.OpPrereqError( 472 "Missing lock for instances: %s" % 473 utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)), 474 errors.ECODE_STATE) 475 476 self.all_node_info = self.cfg.GetAllNodesInfo() 477 self.all_inst_info = self.cfg.GetAllInstancesInfo() 478 self.all_disks_info = self.cfg.GetAllDisksInfo() 479 480 self.my_node_uuids = group_node_uuids 481 self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid]) 482 for node_uuid in group_node_uuids) 483 484 self.my_inst_uuids = group_inst_uuids 485 self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid]) 486 for inst_uuid in group_inst_uuids) 487 488 # We detect here the nodes that will need the extra RPC calls for verifying 489 # split LV volumes; they should be locked. 490 extra_lv_nodes = set() 491 492 for inst in self.my_inst_info.values(): 493 disks = self.cfg.GetInstanceDisks(inst.uuid) 494 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 495 inst_nodes = self.cfg.GetInstanceNodes(inst.uuid) 496 for nuuid in inst_nodes: 497 if self.all_node_info[nuuid].group != self.group_uuid: 498 extra_lv_nodes.add(nuuid) 499 500 unlocked_lv_nodes = \ 501 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE)) 502 503 if unlocked_lv_nodes: 504 raise errors.OpPrereqError("Missing node locks for LV check: %s" % 505 utils.CommaJoin(unlocked_lv_nodes), 506 errors.ECODE_STATE) 507 self.extra_lv_nodes = list(extra_lv_nodes)
508
509 - def _VerifyNode(self, ninfo, nresult):
510 """Perform some basic validation on data returned from a node. 511 512 - check the result data structure is well formed and has all the 513 mandatory fields 514 - check ganeti version 515 516 @type ninfo: L{objects.Node} 517 @param ninfo: the node to check 518 @param nresult: the results from the node 519 @rtype: boolean 520 @return: whether overall this call was successful (and we can expect 521 reasonable values in the respose) 522 523 """ 524 # main result, nresult should be a non-empty dict 525 test = not nresult or not isinstance(nresult, dict) 526 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 527 "unable to verify node: no data returned") 528 if test: 529 return False 530 531 # compares ganeti version 532 local_version = constants.PROTOCOL_VERSION 533 remote_version = nresult.get("version", None) 534 test = not (remote_version and 535 isinstance(remote_version, (list, tuple)) and 536 len(remote_version) == 2) 537 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 538 "connection to node returned invalid data") 539 if test: 540 return False 541 542 test = local_version != remote_version[0] 543 self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name, 544 "incompatible protocol versions: master %s," 545 " node %s", local_version, remote_version[0]) 546 if test: 547 return False 548 549 # node seems compatible, we can actually try to look into its results 550 551 # full package version 552 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1], 553 constants.CV_ENODEVERSION, ninfo.name, 554 "software version mismatch: master %s, node %s", 555 constants.RELEASE_VERSION, remote_version[1], 556 code=self.ETYPE_WARNING) 557 558 hyp_result = nresult.get(constants.NV_HYPERVISOR, None) 559 if ninfo.vm_capable and isinstance(hyp_result, dict): 560 for hv_name, hv_result in hyp_result.iteritems(): 561 test = hv_result is not None 562 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 563 "hypervisor %s verify failure: '%s'", hv_name, hv_result) 564 565 hvp_result = nresult.get(constants.NV_HVPARAMS, None) 566 if ninfo.vm_capable and isinstance(hvp_result, list): 567 for item, hv_name, hv_result in hvp_result: 568 self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name, 569 "hypervisor %s parameter verify failure (source %s): %s", 570 hv_name, item, hv_result) 571 572 test = nresult.get(constants.NV_NODESETUP, 573 ["Missing NODESETUP results"]) 574 self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name, 575 "node setup error: %s", "; ".join(test)) 576 577 return True
578
579 - def _VerifyNodeTime(self, ninfo, nresult, 580 nvinfo_starttime, nvinfo_endtime):
581 """Check the node time. 582 583 @type ninfo: L{objects.Node} 584 @param ninfo: the node to check 585 @param nresult: the remote results for the node 586 @param nvinfo_starttime: the start time of the RPC call 587 @param nvinfo_endtime: the end time of the RPC call 588 589 """ 590 ntime = nresult.get(constants.NV_TIME, None) 591 try: 592 ntime_merged = utils.MergeTime(ntime) 593 except (ValueError, TypeError): 594 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name, 595 "Node returned invalid time") 596 return 597 598 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): 599 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged) 600 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW): 601 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime) 602 else: 603 ntime_diff = None 604 605 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name, 606 "Node time diverges by at least %s from master node time", 607 ntime_diff)
608
609 - def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
610 """Check the node LVM results and update info for cross-node checks. 611 612 @type ninfo: L{objects.Node} 613 @param ninfo: the node to check 614 @param nresult: the remote results for the node 615 @param vg_name: the configured VG name 616 @type nimg: L{NodeImage} 617 @param nimg: node image 618 619 """ 620 if vg_name is None: 621 return 622 623 # checks vg existence and size > 20G 624 vglist = nresult.get(constants.NV_VGLIST, None) 625 test = not vglist 626 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 627 "unable to check volume groups") 628 if not test: 629 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name, 630 constants.MIN_VG_SIZE) 631 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus) 632 633 # Check PVs 634 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage) 635 for em in errmsgs: 636 self._Error(constants.CV_ENODELVM, ninfo.name, em) 637 if pvminmax is not None: 638 (nimg.pv_min, nimg.pv_max) = pvminmax
639
640 - def _VerifyGroupDRBDVersion(self, node_verify_infos):
641 """Check cross-node DRBD version consistency. 642 643 @type node_verify_infos: dict 644 @param node_verify_infos: infos about nodes as returned from the 645 node_verify call. 646 647 """ 648 node_versions = {} 649 for node_uuid, ndata in node_verify_infos.items(): 650 nresult = ndata.payload 651 if nresult: 652 version = nresult.get(constants.NV_DRBDVERSION, None) 653 if version: 654 node_versions[node_uuid] = version 655 656 if len(set(node_versions.values())) > 1: 657 for node_uuid, version in sorted(node_versions.items()): 658 msg = "DRBD version mismatch: %s" % version 659 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg, 660 code=self.ETYPE_WARNING)
661
662 - def _VerifyGroupLVM(self, node_image, vg_name):
663 """Check cross-node consistency in LVM. 664 665 @type node_image: dict 666 @param node_image: info about nodes, mapping from node to names to 667 L{NodeImage} objects 668 @param vg_name: the configured VG name 669 670 """ 671 if vg_name is None: 672 return 673 674 # Only exclusive storage needs this kind of checks 675 if not self._exclusive_storage: 676 return 677 678 # exclusive_storage wants all PVs to have the same size (approximately), 679 # if the smallest and the biggest ones are okay, everything is fine. 680 # pv_min is None iff pv_max is None 681 vals = filter((lambda ni: ni.pv_min is not None), node_image.values()) 682 if not vals: 683 return 684 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals) 685 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals) 686 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax) 687 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name, 688 "PV sizes differ too much in the group; smallest (%s MB) is" 689 " on %s, biggest (%s MB) is on %s", 690 pvmin, self.cfg.GetNodeName(minnode_uuid), 691 pvmax, self.cfg.GetNodeName(maxnode_uuid))
692
693 - def _VerifyNodeBridges(self, ninfo, nresult, bridges):
694 """Check the node bridges. 695 696 @type ninfo: L{objects.Node} 697 @param ninfo: the node to check 698 @param nresult: the remote results for the node 699 @param bridges: the expected list of bridges 700 701 """ 702 if not bridges: 703 return 704 705 missing = nresult.get(constants.NV_BRIDGES, None) 706 test = not isinstance(missing, list) 707 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 708 "did not return valid bridge information") 709 if not test: 710 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name, 711 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
712
713 - def _VerifyNodeUserScripts(self, ninfo, nresult):
714 """Check the results of user scripts presence and executability on the node 715 716 @type ninfo: L{objects.Node} 717 @param ninfo: the node to check 718 @param nresult: the remote results for the node 719 720 """ 721 test = not constants.NV_USERSCRIPTS in nresult 722 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 723 "did not return user scripts information") 724 725 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None) 726 if not test: 727 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 728 "user scripts not present or not executable: %s" % 729 utils.CommaJoin(sorted(broken_scripts)))
730
731 - def _VerifyNodeNetwork(self, ninfo, nresult):
732 """Check the node network connectivity results. 733 734 @type ninfo: L{objects.Node} 735 @param ninfo: the node to check 736 @param nresult: the remote results for the node 737 738 """ 739 test = constants.NV_NODELIST not in nresult 740 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name, 741 "node hasn't returned node ssh connectivity data") 742 if not test: 743 if nresult[constants.NV_NODELIST]: 744 for a_node, a_msg in nresult[constants.NV_NODELIST].items(): 745 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name, 746 "ssh communication with node '%s': %s", a_node, a_msg) 747 748 test = constants.NV_NODENETTEST not in nresult 749 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 750 "node hasn't returned node tcp connectivity data") 751 if not test: 752 if nresult[constants.NV_NODENETTEST]: 753 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) 754 for anode in nlist: 755 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, 756 "tcp communication with node '%s': %s", 757 anode, nresult[constants.NV_NODENETTEST][anode]) 758 759 test = constants.NV_MASTERIP not in nresult 760 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 761 "node hasn't returned node master IP reachability data") 762 if not test: 763 if not nresult[constants.NV_MASTERIP]: 764 if ninfo.uuid == self.master_node: 765 msg = "the master node cannot reach the master IP (not configured?)" 766 else: 767 msg = "cannot reach the master IP" 768 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
769
770 - def _VerifyInstance(self, instance, node_image, diskstatus):
771 """Verify an instance. 772 773 This function checks to see if the required block devices are 774 available on the instance's node, and that the nodes are in the correct 775 state. 776 777 """ 778 pnode_uuid = instance.primary_node 779 pnode_img = node_image[pnode_uuid] 780 groupinfo = self.cfg.GetAllNodeGroupsInfo() 781 782 node_vol_should = {} 783 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 784 785 cluster = self.cfg.GetClusterInfo() 786 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster, 787 self.group_info) 788 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg) 789 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name, 790 utils.CommaJoin(err), code=self.ETYPE_WARNING) 791 792 for node_uuid in node_vol_should: 793 n_img = node_image[node_uuid] 794 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail: 795 # ignore missing volumes on offline or broken nodes 796 continue 797 for volume in node_vol_should[node_uuid]: 798 test = volume not in n_img.volumes 799 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name, 800 "volume %s missing on node %s", volume, 801 self.cfg.GetNodeName(node_uuid)) 802 803 if instance.admin_state == constants.ADMINST_UP: 804 test = instance.uuid not in pnode_img.instances and not pnode_img.offline 805 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name, 806 "instance not running on its primary node %s", 807 self.cfg.GetNodeName(pnode_uuid)) 808 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE, 809 instance.name, "instance is marked as running and lives on" 810 " offline node %s", self.cfg.GetNodeName(pnode_uuid)) 811 812 diskdata = [(nname, success, status, idx) 813 for (nname, disks) in diskstatus.items() 814 for idx, (success, status) in enumerate(disks)] 815 816 for nname, success, bdev_status, idx in diskdata: 817 # the 'ghost node' construction in Exec() ensures that we have a 818 # node here 819 snode = node_image[nname] 820 bad_snode = snode.ghost or snode.offline 821 self._ErrorIf(instance.disks_active and 822 not success and not bad_snode, 823 constants.CV_EINSTANCEFAULTYDISK, instance.name, 824 "couldn't retrieve status for disk/%s on %s: %s", 825 idx, self.cfg.GetNodeName(nname), bdev_status) 826 827 if instance.disks_active and success and bdev_status.is_degraded: 828 msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname)) 829 830 code = self.ETYPE_ERROR 831 accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC] 832 833 if bdev_status.ldisk_status in accepted_lds: 834 code = self.ETYPE_WARNING 835 836 msg += "; local disk state is '%s'" % \ 837 constants.LDS_NAMES[bdev_status.ldisk_status] 838 839 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg, 840 code=code) 841 842 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline, 843 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid), 844 "instance %s, connection to primary node failed", 845 instance.name) 846 847 secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid) 848 self._ErrorIf(len(secondary_nodes) > 1, 849 constants.CV_EINSTANCELAYOUT, instance.name, 850 "instance has multiple secondary nodes: %s", 851 utils.CommaJoin(secondary_nodes), 852 code=self.ETYPE_WARNING) 853 854 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid) 855 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes) 856 disks = self.cfg.GetInstanceDisks(instance.uuid) 857 if any(es_flags.values()): 858 if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE): 
859 # Disk template not compatible with exclusive_storage: no instance 860 # node should have the flag set 861 es_nodes = [n 862 for (n, es) in es_flags.items() 863 if es] 864 unsupported = [d.dev_type for d in disks 865 if d.dev_type not in constants.DTS_EXCL_STORAGE] 866 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name, 867 "instance uses disk types %s, which are not supported on" 868 " nodes that have exclusive storage set: %s", 869 utils.CommaJoin(unsupported), 870 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes))) 871 for (idx, disk) in enumerate(disks): 872 self._ErrorIf(disk.spindles is None, 873 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name, 874 "number of spindles not configured for disk %s while" 875 " exclusive storage is enabled, try running" 876 " gnt-cluster repair-disk-sizes", idx) 877 878 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 879 instance_nodes = utils.NiceSort(inst_nodes) 880 instance_groups = {} 881 882 for node_uuid in instance_nodes: 883 instance_groups.setdefault(self.all_node_info[node_uuid].group, 884 []).append(node_uuid) 885 886 pretty_list = [ 887 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)), 888 groupinfo[group].name) 889 # Sort so that we always list the primary node first. 890 for group, nodes in sorted(instance_groups.items(), 891 key=lambda (_, nodes): pnode_uuid in nodes, 892 reverse=True)] 893 894 self._ErrorIf(len(instance_groups) > 1, 895 constants.CV_EINSTANCESPLITGROUPS, 896 instance.name, "instance has primary and secondary nodes in" 897 " different groups: %s", utils.CommaJoin(pretty_list), 898 code=self.ETYPE_WARNING) 899 900 inst_nodes_offline = [] 901 for snode in secondary_nodes: 902 s_img = node_image[snode] 903 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC, 904 self.cfg.GetNodeName(snode), 905 "instance %s, connection to secondary node failed", 906 instance.name) 907 908 if s_img.offline: 909 inst_nodes_offline.append(snode) 910 911 # warn that the instance lives on offline nodes 912 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, 913 instance.name, "instance has offline secondary node(s) %s", 914 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline))) 915 # ... or ghost/non-vm_capable nodes 916 for node_uuid in inst_nodes: 917 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE, 918 instance.name, "instance lives on ghost node %s", 919 self.cfg.GetNodeName(node_uuid)) 920 self._ErrorIf(not node_image[node_uuid].vm_capable, 921 constants.CV_EINSTANCEBADNODE, instance.name, 922 "instance lives on non-vm_capable node %s", 923 self.cfg.GetNodeName(node_uuid))
924
925 - def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image, 926 reserved):
927 """Verify if there are any unknown volumes in the cluster. 928 929 The .os, .swap and backup volumes are ignored. All other volumes are 930 reported as unknown. 931 932 @type vg_name: string 933 @param vg_name: the name of the Ganeti-administered volume group 934 @type reserved: L{ganeti.utils.FieldSet} 935 @param reserved: a FieldSet of reserved volume names 936 937 """ 938 for node_uuid, n_img in node_image.items(): 939 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or 940 self.all_node_info[node_uuid].group != self.group_uuid): 941 # skip non-healthy nodes 942 continue 943 for volume in n_img.volumes: 944 # skip volumes not belonging to the ganeti-administered volume group 945 if volume.split('/')[0] != vg_name: 946 continue 947 948 test = ((node_uuid not in node_vol_should or 949 volume not in node_vol_should[node_uuid]) and 950 not reserved.Matches(volume)) 951 self._ErrorIf(test, constants.CV_ENODEORPHANLV, 952 self.cfg.GetNodeName(node_uuid), 953 "volume %s is unknown", volume, 954 code=_VerifyErrors.ETYPE_WARNING)
955
956 - def _VerifyNPlusOneMemory(self, node_image, all_insts):
957 """Verify N+1 Memory Resilience. 958 959 Check that if one single node dies we can still start all the 960 instances it was primary for. 961 962 """ 963 cluster_info = self.cfg.GetClusterInfo() 964 for node_uuid, n_img in node_image.items(): 965 # This code checks that every node which is now listed as 966 # secondary has enough memory to host all instances it is 967 # supposed to should a single other node in the cluster fail. 968 # FIXME: not ready for failover to an arbitrary node 969 # FIXME: does not support file-backed instances 970 # WARNING: we currently take into account down instances as well 971 # as up ones, considering that even if they're down someone 972 # might want to start them even in the event of a node failure. 973 if n_img.offline or \ 974 self.all_node_info[node_uuid].group != self.group_uuid: 975 # we're skipping nodes marked offline and nodes in other groups from 976 # the N+1 warning, since most likely we don't have good memory 977 # information from them; we already list instances living on such 978 # nodes, and that's enough warning 979 continue 980 #TODO(dynmem): also consider ballooning out other instances 981 for prinode, inst_uuids in n_img.sbp.items(): 982 needed_mem = 0 983 for inst_uuid in inst_uuids: 984 bep = cluster_info.FillBE(all_insts[inst_uuid]) 985 if bep[constants.BE_AUTO_BALANCE]: 986 needed_mem += bep[constants.BE_MINMEM] 987 test = n_img.mfree < needed_mem 988 self._ErrorIf(test, constants.CV_ENODEN1, 989 self.cfg.GetNodeName(node_uuid), 990 "not enough memory to accomodate instance failovers" 991 " should node %s fail (%dMiB needed, %dMiB available)", 992 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
993
994 - def _VerifyClientCertificates(self, nodes, all_nvinfo):
995 """Verifies the consistency of the client certificates. 996 997 This includes several aspects: 998 - the individual validation of all nodes' certificates 999 - the consistency of the master candidate certificate map 1000 - the consistency of the master candidate certificate map with the 1001 certificates that the master candidates are actually using. 1002 1003 @param nodes: the list of nodes to consider in this verification 1004 @param all_nvinfo: the map of results of the verify_node call to 1005 all nodes 1006 1007 """ 1008 candidate_certs = self.cfg.GetClusterInfo().candidate_certs 1009 if candidate_certs is None or len(candidate_certs) == 0: 1010 self._ErrorIf( 1011 True, constants.CV_ECLUSTERCLIENTCERT, None, 1012 "The cluster's list of master candidate certificates is empty." 1013 " If you just updated the cluster, please run" 1014 " 'gnt-cluster renew-crypto --new-node-certificates'.") 1015 return 1016 1017 self._ErrorIf( 1018 len(candidate_certs) != len(set(candidate_certs.values())), 1019 constants.CV_ECLUSTERCLIENTCERT, None, 1020 "There are at least two master candidates configured to use the same" 1021 " certificate.") 1022 1023 # collect the client certificate 1024 for node in nodes: 1025 if node.offline: 1026 continue 1027 1028 nresult = all_nvinfo[node.uuid] 1029 if nresult.fail_msg or not nresult.payload: 1030 continue 1031 1032 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None) 1033 1034 self._ErrorIf( 1035 errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None, 1036 "Client certificate of node '%s' failed validation: %s (code '%s')", 1037 node.uuid, msg, errcode) 1038 1039 if not errcode: 1040 digest = msg 1041 if node.master_candidate: 1042 if node.uuid in candidate_certs: 1043 self._ErrorIf( 1044 digest != candidate_certs[node.uuid], 1045 constants.CV_ECLUSTERCLIENTCERT, None, 1046 "Client certificate digest of master candidate '%s' does not" 1047 " match its entry in the cluster's map of master candidate" 1048 " certificates. Expected: %s Got: %s", node.uuid, 1049 digest, candidate_certs[node.uuid]) 1050 else: 1051 self._ErrorIf( 1052 True, constants.CV_ECLUSTERCLIENTCERT, None, 1053 "The master candidate '%s' does not have an entry in the" 1054 " map of candidate certificates.", node.uuid) 1055 self._ErrorIf( 1056 digest in candidate_certs.values(), 1057 constants.CV_ECLUSTERCLIENTCERT, None, 1058 "Master candidate '%s' is using a certificate of another node.", 1059 node.uuid) 1060 else: 1061 self._ErrorIf( 1062 node.uuid in candidate_certs, 1063 constants.CV_ECLUSTERCLIENTCERT, None, 1064 "Node '%s' is not a master candidate, but still listed in the" 1065 " map of master candidate certificates.", node.uuid) 1066 self._ErrorIf( 1067 (node.uuid not in candidate_certs) and 1068 (digest in candidate_certs.values()), 1069 constants.CV_ECLUSTERCLIENTCERT, None, 1070 "Node '%s' is not a master candidate and is incorrectly using a" 1071 " certificate of another node which is master candidate.", 1072 node.uuid)
1073
1074 - def _VerifySshSetup(self, nodes, all_nvinfo):
1075 """Evaluates the verification results of the SSH setup and clutter test. 1076 1077 @param nodes: List of L{objects.Node} objects 1078 @param all_nvinfo: RPC results 1079 1080 """ 1081 for node in nodes: 1082 if not node.offline: 1083 nresult = all_nvinfo[node.uuid] 1084 if nresult.fail_msg or not nresult.payload: 1085 self._ErrorIf(True, constants.CV_ENODESSH, node.name, 1086 "Could not verify the SSH setup of this node.") 1087 return 1088 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]: 1089 result = nresult.payload.get(ssh_test, None) 1090 error_msg = "" 1091 if isinstance(result, list): 1092 error_msg = " ".join(result) 1093 self._ErrorIf(result, 1094 constants.CV_ENODESSH, None, error_msg)
1095
1096 - def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo, 1097 (files_all, files_opt, files_mc, files_vm)):
1098 """Verifies file checksums collected from all nodes. 1099 1100 @param nodes: List of L{objects.Node} objects 1101 @param master_node_uuid: UUID of master node 1102 @param all_nvinfo: RPC results 1103 1104 """ 1105 # Define functions determining which nodes to consider for a file 1106 files2nodefn = [ 1107 (files_all, None), 1108 (files_mc, lambda node: (node.master_candidate or 1109 node.uuid == master_node_uuid)), 1110 (files_vm, lambda node: node.vm_capable), 1111 ] 1112 1113 # Build mapping from filename to list of nodes which should have the file 1114 nodefiles = {} 1115 for (files, fn) in files2nodefn: 1116 if fn is None: 1117 filenodes = nodes 1118 else: 1119 filenodes = filter(fn, nodes) 1120 nodefiles.update((filename, 1121 frozenset(map(operator.attrgetter("uuid"), filenodes))) 1122 for filename in files) 1123 1124 assert set(nodefiles) == (files_all | files_mc | files_vm) 1125 1126 fileinfo = dict((filename, {}) for filename in nodefiles) 1127 ignore_nodes = set() 1128 1129 for node in nodes: 1130 if node.offline: 1131 ignore_nodes.add(node.uuid) 1132 continue 1133 1134 nresult = all_nvinfo[node.uuid] 1135 1136 if nresult.fail_msg or not nresult.payload: 1137 node_files = None 1138 else: 1139 fingerprints = nresult.payload.get(constants.NV_FILELIST, {}) 1140 node_files = dict((vcluster.LocalizeVirtualPath(key), value) 1141 for (key, value) in fingerprints.items()) 1142 del fingerprints 1143 1144 test = not (node_files and isinstance(node_files, dict)) 1145 self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name, 1146 "Node did not return file checksum data") 1147 if test: 1148 ignore_nodes.add(node.uuid) 1149 continue 1150 1151 # Build per-checksum mapping from filename to nodes having it 1152 for (filename, checksum) in node_files.items(): 1153 assert filename in nodefiles 1154 fileinfo[filename].setdefault(checksum, set()).add(node.uuid) 1155 1156 for (filename, checksums) in fileinfo.items(): 1157 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum" 1158 1159 # Nodes having the file 1160 with_file = frozenset(node_uuid 1161 for node_uuids in fileinfo[filename].values() 1162 for node_uuid in node_uuids) - ignore_nodes 1163 1164 expected_nodes = nodefiles[filename] - ignore_nodes 1165 1166 # Nodes missing file 1167 missing_file = expected_nodes - with_file 1168 1169 if filename in files_opt: 1170 # All or no nodes 1171 self._ErrorIf(missing_file and missing_file != expected_nodes, 1172 constants.CV_ECLUSTERFILECHECK, None, 1173 "File %s is optional, but it must exist on all or no" 1174 " nodes (not found on %s)", 1175 filename, 1176 utils.CommaJoin( 1177 utils.NiceSort( 1178 map(self.cfg.GetNodeName, missing_file)))) 1179 else: 1180 self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None, 1181 "File %s is missing from node(s) %s", filename, 1182 utils.CommaJoin( 1183 utils.NiceSort( 1184 map(self.cfg.GetNodeName, missing_file)))) 1185 1186 # Warn if a node has a file it shouldn't 1187 unexpected = with_file - expected_nodes 1188 self._ErrorIf(unexpected, 1189 constants.CV_ECLUSTERFILECHECK, None, 1190 "File %s should not exist on node(s) %s", 1191 filename, utils.CommaJoin( 1192 utils.NiceSort(map(self.cfg.GetNodeName, unexpected)))) 1193 1194 # See if there are multiple versions of the file 1195 test = len(checksums) > 1 1196 if test: 1197 variants = ["variant %s on %s" % 1198 (idx + 1, 1199 utils.CommaJoin(utils.NiceSort( 1200 map(self.cfg.GetNodeName, node_uuids)))) 1201 for (idx, (checksum, node_uuids)) in 1202 
enumerate(sorted(checksums.items()))] 1203 else: 1204 variants = [] 1205 1206 self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None, 1207 "File %s found with %s different checksums (%s)", 1208 filename, len(checksums), "; ".join(variants))
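The core of the cross-node file consistency check above, reduced to plain sets and dicts (file names, checksums and node names are invented):

# filename -> {checksum -> set of nodes reporting it}
fileinfo = {
    "/etc/ganeti/ssconf_cluster_name": {"abc123": {"node1", "node2"},
                                        "def456": {"node3"}},
}
# filename -> set of nodes expected to hold the file
nodefiles = {"/etc/ganeti/ssconf_cluster_name": {"node1", "node2", "node3",
                                                 "node4"}}

for filename, checksums in fileinfo.items():
    with_file = set()
    for nodes in checksums.values():
        with_file |= nodes
    expected = nodefiles[filename]

    missing = expected - with_file          # nodes lacking the file
    unexpected = with_file - expected       # nodes that should not have it
    if missing:
        print("File %s is missing from node(s) %s" %
              (filename, ", ".join(sorted(missing))))
    if unexpected:
        print("File %s should not exist on node(s) %s" %
              (filename, ", ".join(sorted(unexpected))))
    if len(checksums) > 1:
        print("File %s found with %d different checksums" %
              (filename, len(checksums)))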
1209
1210 - def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1211 """Verify the drbd helper. 1212 1213 """ 1214 if drbd_helper: 1215 helper_result = nresult.get(constants.NV_DRBDHELPER, None) 1216 test = (helper_result is None) 1217 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1218 "no drbd usermode helper returned") 1219 if helper_result: 1220 status, payload = helper_result 1221 test = not status 1222 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1223 "drbd usermode helper check unsuccessful: %s", payload) 1224 test = status and (payload != drbd_helper) 1225 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1226 "wrong drbd usermode helper: %s", payload)
1227 1228 @staticmethod
1229 - def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1230 """Gives the DRBD information in a map for a node. 1231 1232 @type ninfo: L{objects.Node} 1233 @param ninfo: the node to check 1234 @param instanceinfo: the dict of instances 1235 @param disks_info: the dict of disks 1236 @param drbd_map: the DRBD map as returned by 1237 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1238 @type error_if: callable like L{_ErrorIf} 1239 @param error_if: The error reporting function 1240 @return: dict from minor number to (disk_uuid, instance_uuid, active) 1241 1242 """ 1243 node_drbd = {} 1244 for minor, disk_uuid in drbd_map[ninfo.uuid].items(): 1245 test = disk_uuid not in disks_info 1246 error_if(test, constants.CV_ECLUSTERCFG, None, 1247 "ghost disk '%s' in temporary DRBD map", disk_uuid) 1248 # ghost disk should not be active, but otherwise we 1249 # don't give double warnings (both ghost disk and 1250 # unallocated minor in use) 1251 if test: 1252 node_drbd[minor] = (disk_uuid, None, False) 1253 else: 1254 disk_active = False 1255 disk_instance = None 1256 for (inst_uuid, inst) in instanceinfo.items(): 1257 if disk_uuid in inst.disks: 1258 disk_active = inst.disks_active 1259 disk_instance = inst_uuid 1260 break 1261 node_drbd[minor] = (disk_uuid, disk_instance, disk_active) 1262 return node_drbd
1263
1264 - def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info, 1265 drbd_helper, drbd_map):
1266 """Verifies and the node DRBD status. 1267 1268 @type ninfo: L{objects.Node} 1269 @param ninfo: the node to check 1270 @param nresult: the remote results for the node 1271 @param instanceinfo: the dict of instances 1272 @param disks_info: the dict of disks 1273 @param drbd_helper: the configured DRBD usermode helper 1274 @param drbd_map: the DRBD map as returned by 1275 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1276 1277 """ 1278 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper) 1279 1280 # compute the DRBD minors 1281 node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info, 1282 drbd_map, self._ErrorIf) 1283 1284 # and now check them 1285 used_minors = nresult.get(constants.NV_DRBDLIST, []) 1286 test = not isinstance(used_minors, (tuple, list)) 1287 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1288 "cannot parse drbd status file: %s", str(used_minors)) 1289 if test: 1290 # we cannot check drbd status 1291 return 1292 1293 for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items(): 1294 test = minor not in used_minors and must_exist 1295 if inst_uuid is not None: 1296 attached = "(attached in instance '%s')" % \ 1297 self.cfg.GetInstanceName(inst_uuid) 1298 else: 1299 attached = "(detached)" 1300 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1301 "drbd minor %d of disk %s %s is not active", 1302 minor, disk_uuid, attached) 1303 for minor in used_minors: 1304 test = minor not in node_drbd 1305 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1306 "unallocated drbd minor %d is in use", minor)
1307
1308 - def _UpdateNodeOS(self, ninfo, nresult, nimg):
1309 """Builds the node OS structures. 1310 1311 @type ninfo: L{objects.Node} 1312 @param ninfo: the node to check 1313 @param nresult: the remote results for the node 1314 @param nimg: the node image object 1315 1316 """ 1317 remote_os = nresult.get(constants.NV_OSLIST, None) 1318 test = (not isinstance(remote_os, list) or 1319 not compat.all(isinstance(v, list) and len(v) == 8 1320 for v in remote_os)) 1321 1322 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 1323 "node hasn't returned valid OS data") 1324 1325 nimg.os_fail = test 1326 1327 if test: 1328 return 1329 1330 os_dict = {} 1331 1332 for (name, os_path, status, diagnose, 1333 variants, parameters, api_ver, 1334 trusted) in nresult[constants.NV_OSLIST]: 1335 1336 if name not in os_dict: 1337 os_dict[name] = [] 1338 1339 # parameters is a list of lists instead of list of tuples due to 1340 # JSON lacking a real tuple type, fix it: 1341 parameters = [tuple(v) for v in parameters] 1342 os_dict[name].append((os_path, status, diagnose, 1343 set(variants), set(parameters), set(api_ver), 1344 trusted)) 1345 1346 nimg.oslist = os_dict
1347
1348 - def _VerifyNodeOS(self, ninfo, nimg, base):
1349 """Verifies the node OS list. 1350 1351 @type ninfo: L{objects.Node} 1352 @param ninfo: the node to check 1353 @param nimg: the node image object 1354 @param base: the 'template' node we match against (e.g. from the master) 1355 1356 """ 1357 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?" 1358 1359 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l] 1360 for os_name, os_data in nimg.oslist.items(): 1361 assert os_data, "Empty OS status for OS %s?!" % os_name 1362 f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0] 1363 self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name, 1364 "Invalid OS %s (located at %s): %s", 1365 os_name, f_path, f_diag) 1366 self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name, 1367 "OS '%s' has multiple entries" 1368 " (first one shadows the rest): %s", 1369 os_name, utils.CommaJoin([v[0] for v in os_data])) 1370 # comparisons with the 'base' image 1371 test = os_name not in base.oslist 1372 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 1373 "Extra OS %s not present on reference node (%s)", 1374 os_name, self.cfg.GetNodeName(base.uuid)) 1375 if test: 1376 continue 1377 assert base.oslist[os_name], "Base node has empty OS status?" 1378 _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0] 1379 if not b_status: 1380 # base OS is invalid, skipping 1381 continue 1382 for kind, a, b in [("API version", f_api, b_api), 1383 ("variants list", f_var, b_var), 1384 ("parameters", beautify_params(f_param), 1385 beautify_params(b_param))]: 1386 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name, 1387 "OS %s for %s differs from reference node %s:" 1388 " [%s] vs. [%s]", kind, os_name, 1389 self.cfg.GetNodeName(base.uuid), 1390 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b))) 1391 for kind, a, b in [("trusted", f_trusted, b_trusted)]: 1392 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name, 1393 "OS %s for %s differs from reference node %s:" 1394 " %s vs. %s", kind, os_name, 1395 self.cfg.GetNodeName(base.uuid), a, b) 1396 1397 # check any missing OSes 1398 missing = set(base.oslist.keys()).difference(nimg.oslist.keys()) 1399 self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name, 1400 "OSes present on reference node %s" 1401 " but missing on this node: %s", 1402 self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1403
1404 - def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1405 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}. 1406 1407 @type ninfo: L{objects.Node} 1408 @param ninfo: the node to check 1409 @param nresult: the remote results for the node 1410 @type is_master: bool 1411 @param is_master: Whether node is the master node 1412 1413 """ 1414 cluster = self.cfg.GetClusterInfo() 1415 if (is_master and 1416 (cluster.IsFileStorageEnabled() or 1417 cluster.IsSharedFileStorageEnabled())): 1418 try: 1419 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS] 1420 except KeyError: 1421 # This should never happen 1422 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1423 "Node did not return forbidden file storage paths") 1424 else: 1425 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1426 "Found forbidden file storage paths: %s", 1427 utils.CommaJoin(fspaths)) 1428 else: 1429 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult, 1430 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1431 "Node should not have returned forbidden file storage" 1432 " paths")
1433
1434 - def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template, 1435 verify_key, error_key):
1436 """Verifies (file) storage paths. 1437 1438 @type ninfo: L{objects.Node} 1439 @param ninfo: the node to check 1440 @param nresult: the remote results for the node 1441 @type file_disk_template: string 1442 @param file_disk_template: file-based disk template, whose directory 1443 is supposed to be verified 1444 @type verify_key: string 1445 @param verify_key: key for the verification map of this file 1446 verification step 1447 @param error_key: error key to be added to the verification results 1448 in case something goes wrong in this verification step 1449 1450 """ 1451 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes( 1452 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER 1453 )) 1454 1455 cluster = self.cfg.GetClusterInfo() 1456 if cluster.IsDiskTemplateEnabled(file_disk_template): 1457 self._ErrorIf( 1458 verify_key in nresult, 1459 error_key, ninfo.name, 1460 "The configured %s storage path is unusable: %s" % 1461 (file_disk_template, nresult.get(verify_key)))
1462
1463 - def _VerifyFileStoragePaths(self, ninfo, nresult):
1464 """Verifies (file) storage paths. 1465 1466 @see: C{_VerifyStoragePaths} 1467 1468 """ 1469 self._VerifyStoragePaths( 1470 ninfo, nresult, constants.DT_FILE, 1471 constants.NV_FILE_STORAGE_PATH, 1472 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1473
1474 - def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1475 """Verifies (file) storage paths. 1476 1477 @see: C{_VerifyStoragePaths} 1478 1479 """ 1480 self._VerifyStoragePaths( 1481 ninfo, nresult, constants.DT_SHARED_FILE, 1482 constants.NV_SHARED_FILE_STORAGE_PATH, 1483 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1484
1485 - def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1486 """Verifies (file) storage paths. 1487 1488 @see: C{_VerifyStoragePaths} 1489 1490 """ 1491 self._VerifyStoragePaths( 1492 ninfo, nresult, constants.DT_GLUSTER, 1493 constants.NV_GLUSTER_STORAGE_PATH, 1494 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1495
1496 - def _VerifyOob(self, ninfo, nresult):
1497 """Verifies out of band functionality of a node. 1498 1499 @type ninfo: L{objects.Node} 1500 @param ninfo: the node to check 1501 @param nresult: the remote results for the node 1502 1503 """ 1504 # We just have to verify the paths on master and/or master candidates 1505 # as the oob helper is invoked on the master 1506 if ((ninfo.master_candidate or ninfo.master_capable) and 1507 constants.NV_OOB_PATHS in nresult): 1508 for path_result in nresult[constants.NV_OOB_PATHS]: 1509 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, 1510 ninfo.name, path_result)
1511
1512 - def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1513 """Verifies and updates the node volume data. 1514 1515 This function will update a L{NodeImage}'s internal structures 1516 with data from the remote call. 1517 1518 @type ninfo: L{objects.Node} 1519 @param ninfo: the node to check 1520 @param nresult: the remote results for the node 1521 @param nimg: the node image object 1522 @param vg_name: the configured VG name 1523 1524 """ 1525 nimg.lvm_fail = True 1526 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data") 1527 if vg_name is None: 1528 pass 1529 elif isinstance(lvdata, basestring): 1530 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1531 "LVM problem on node: %s", utils.SafeEncode(lvdata)) 1532 elif not isinstance(lvdata, dict): 1533 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1534 "rpc call to node failed (lvlist)") 1535 else: 1536 nimg.volumes = lvdata 1537 nimg.lvm_fail = False
1538
1539 - def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1540 """Verifies and updates the node instance list. 1541 1542 If the listing was successful, then updates this node's instance 1543 list. Otherwise, it marks the RPC call as failed for the instance 1544 list key. 1545 1546 @type ninfo: L{objects.Node} 1547 @param ninfo: the node to check 1548 @param nresult: the remote results for the node 1549 @param nimg: the node image object 1550 1551 """ 1552 idata = nresult.get(constants.NV_INSTANCELIST, None) 1553 test = not isinstance(idata, list) 1554 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1555 "rpc call to node failed (instancelist): %s", 1556 utils.SafeEncode(str(idata))) 1557 if test: 1558 nimg.hyp_fail = True 1559 else: 1560 nimg.instances = [uuid for (uuid, _) in 1561 self.cfg.GetMultiInstanceInfoByName(idata)]
1562
1563 - def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1564 """Verifies and computes a node information map 1565 1566 @type ninfo: L{objects.Node} 1567 @param ninfo: the node to check 1568 @param nresult: the remote results for the node 1569 @param nimg: the node image object 1570 @param vg_name: the configured VG name 1571 1572 """ 1573 # try to read free memory (from the hypervisor) 1574 hv_info = nresult.get(constants.NV_HVINFO, None) 1575 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info 1576 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1577 "rpc call to node failed (hvinfo)") 1578 if not test: 1579 try: 1580 nimg.mfree = int(hv_info["memory_free"]) 1581 except (ValueError, TypeError): 1582 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1583 "node returned invalid nodeinfo, check hypervisor") 1584 1585 # FIXME: devise a free space model for file based instances as well 1586 if vg_name is not None: 1587 test = (constants.NV_VGLIST not in nresult or 1588 vg_name not in nresult[constants.NV_VGLIST]) 1589 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 1590 "node didn't return data for the volume group '%s'" 1591 " - it is either missing or broken", vg_name) 1592 if not test: 1593 try: 1594 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name]) 1595 except (ValueError, TypeError): 1596 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1597 "node returned invalid LVM info, check LVM status")
1598
1599 - def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
1600 """Gets per-disk status information for all instances. 1601 1602 @type node_uuids: list of strings 1603 @param node_uuids: Node UUIDs 1604 @type node_image: dict of (UUID, L{objects.Node}) 1605 @param node_image: Node objects 1606 @type instanceinfo: dict of (UUID, L{objects.Instance}) 1607 @param instanceinfo: Instance objects 1608 @rtype: {instance: {node: [(succes, payload)]}} 1609 @return: a dictionary of per-instance dictionaries with nodes as 1610 keys and disk information as values; the disk information is a 1611 list of tuples (success, payload) 1612 1613 """ 1614 node_disks = {} 1615 node_disks_dev_inst_only = {} 1616 diskless_instances = set() 1617 nodisk_instances = set() 1618 1619 for nuuid in node_uuids: 1620 node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst, 1621 node_image[nuuid].sinst)) 1622 diskless_instances.update(uuid for uuid in node_inst_uuids 1623 if not instanceinfo[uuid].disks) 1624 disks = [(inst_uuid, disk) 1625 for inst_uuid in node_inst_uuids 1626 for disk in self.cfg.GetInstanceDisks(inst_uuid)] 1627 1628 if not disks: 1629 nodisk_instances.update(uuid for uuid in node_inst_uuids 1630 if instanceinfo[uuid].disks) 1631 # No need to collect data 1632 continue 1633 1634 node_disks[nuuid] = disks 1635 1636 # _AnnotateDiskParams makes already copies of the disks 1637 dev_inst_only = [] 1638 for (inst_uuid, dev) in disks: 1639 (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev], 1640 self.cfg) 1641 dev_inst_only.append((anno_disk, instanceinfo[inst_uuid])) 1642 1643 node_disks_dev_inst_only[nuuid] = dev_inst_only 1644 1645 assert len(node_disks) == len(node_disks_dev_inst_only) 1646 1647 # Collect data from all nodes with disks 1648 result = self.rpc.call_blockdev_getmirrorstatus_multi( 1649 node_disks.keys(), node_disks_dev_inst_only) 1650 1651 assert len(result) == len(node_disks) 1652 1653 instdisk = {} 1654 1655 for (nuuid, nres) in result.items(): 1656 node = self.cfg.GetNodeInfo(nuuid) 1657 disks = node_disks[node.uuid] 1658 1659 if nres.offline: 1660 # No data from this node 1661 data = len(disks) * [(False, "node offline")] 1662 else: 1663 msg = nres.fail_msg 1664 self._ErrorIf(msg, constants.CV_ENODERPC, node.name, 1665 "while getting disk information: %s", msg) 1666 if msg: 1667 # No data from this node 1668 data = len(disks) * [(False, msg)] 1669 else: 1670 data = [] 1671 for idx, i in enumerate(nres.payload): 1672 if isinstance(i, (tuple, list)) and len(i) == 2: 1673 data.append(i) 1674 else: 1675 logging.warning("Invalid result from node %s, entry %d: %s", 1676 node.name, idx, i) 1677 data.append((False, "Invalid result from the remote node")) 1678 1679 for ((inst_uuid, _), status) in zip(disks, data): 1680 instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \ 1681 .append(status) 1682 1683 # Add empty entries for diskless instances. 
1684 for inst_uuid in diskless_instances: 1685 assert inst_uuid not in instdisk 1686 instdisk[inst_uuid] = {} 1687 # ...and disk-full instances that happen to have no disks 1688 for inst_uuid in nodisk_instances: 1689 assert inst_uuid not in instdisk 1690 instdisk[inst_uuid] = {} 1691 1692 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and 1693 len(nuuids) <= len( 1694 self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and 1695 compat.all(isinstance(s, (tuple, list)) and 1696 len(s) == 2 for s in statuses) 1697 for inst, nuuids in instdisk.items() 1698 for nuuid, statuses in nuuids.items()) 1699 if __debug__: 1700 instdisk_keys = set(instdisk) 1701 instanceinfo_keys = set(instanceinfo) 1702 assert instdisk_keys == instanceinfo_keys, \ 1703 ("instdisk keys (%s) do not match instanceinfo keys (%s)" % 1704 (instdisk_keys, instanceinfo_keys)) 1705 1706 return instdisk
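Editor's note: an illustration of the structure returned by the disk collection step above, matching the assertions at the end of the method: per-instance dictionaries keyed by node UUID whose values are lists of (success, payload) tuples, one per disk; diskless instances get an empty mapping. All identifiers below are invented.

instdisk_example = {
    "inst-uuid-1": {
        "node-uuid-a": [(True, {"is_degraded": False}), (True, {})],
        "node-uuid-b": [(False, "node offline"), (False, "node offline")],
    },
    "inst-uuid-2": {},  # diskless instance: empty entry, as added above
}

for per_node in instdisk_example.values():
    for statuses in per_node.values():
        assert all(isinstance(s, tuple) and len(s) == 2 for s in statuses)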
1707 1708 @staticmethod
1709 - def _SshNodeSelector(group_uuid, all_nodes):
1710 """Create endless iterators for all potential SSH check hosts. 1711 1712 """ 1713 nodes = [node for node in all_nodes 1714 if (node.group != group_uuid and 1715 not node.offline)] 1716 keyfunc = operator.attrgetter("group") 1717 1718 return map(itertools.cycle, 1719 [sorted(map(operator.attrgetter("name"), names)) 1720 for _, names in itertools.groupby(sorted(nodes, key=keyfunc), 1721 keyfunc)])
1722 1723 @classmethod
1724 - def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
1725 """Choose which nodes should talk to which other nodes. 1726 1727 We will make nodes contact all nodes in their group, and one node from 1728 every other group. 1729 1730 @rtype: tuple of (string, dict of strings to list of strings, string) 1731 @return: a tuple containing the list of all online nodes, a dictionary 1732 mapping node names to additional nodes of other node groups to which 1733 connectivity should be tested, and a list of all online master 1734 candidates 1735 1736 @warning: This algorithm has a known issue if one node group is much 1737 smaller than others (e.g. just one node). In such a case all other 1738 nodes will talk to the single node. 1739 1740 """ 1741 online_nodes = sorted(node.name for node in group_nodes if not node.offline) 1742 online_mcs = sorted(node.name for node in group_nodes 1743 if (node.master_candidate and not node.offline)) 1744 sel = cls._SshNodeSelector(group_uuid, all_nodes) 1745 1746 return (online_nodes, 1747 dict((name, sorted([i.next() for i in sel])) 1748 for name in online_nodes), 1749 online_mcs)
1750
1751 - def _PrepareSshSetupCheck(self):
1752 """Prepare the input data for the SSH setup verification. 1753 1754 """ 1755 all_nodes_info = self.cfg.GetAllNodesInfo() 1756 potential_master_candidates = self.cfg.GetPotentialMasterCandidates() 1757 node_status = [ 1758 (uuid, node_info.name, node_info.master_candidate, 1759 node_info.name in potential_master_candidates, not node_info.offline) 1760 for (uuid, node_info) in all_nodes_info.items()] 1761 return node_status
1762
1763 - def BuildHooksEnv(self):
1764 """Build hooks env. 1765 1766 Cluster-Verify hooks just ran in the post phase and their failure makes 1767 the output be logged in the verify output and the verification to fail. 1768 1769 """ 1770 env = { 1771 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()), 1772 } 1773 1774 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags())) 1775 for node in self.my_node_info.values()) 1776 1777 return env
1778
1779 - def BuildHooksNodes(self):
1780 """Build hooks nodes. 1781 1782 """ 1783 return ([], list(self.my_node_info.keys()))
1784 1785 @staticmethod
1786 - def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced, 1787 i_offline, n_offline, n_drained):
1788 feedback_fn("* Other Notes") 1789 if i_non_redundant: 1790 feedback_fn(" - NOTICE: %d non-redundant instance(s) found." 1791 % len(i_non_redundant)) 1792 1793 if i_non_a_balanced: 1794 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found." 1795 % len(i_non_a_balanced)) 1796 1797 if i_offline: 1798 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline) 1799 1800 if n_offline: 1801 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline) 1802 1803 if n_drained: 1804 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1805
1806 - def Exec(self, feedback_fn): # pylint: disable=R0915
1807 """Verify integrity of the node group, performing various test on nodes. 1808 1809 """ 1810 # This method has too many local variables. pylint: disable=R0914 1811 feedback_fn("* Verifying group '%s'" % self.group_info.name) 1812 1813 if not self.my_node_uuids: 1814 # empty node group 1815 feedback_fn("* Empty node group, skipping verification") 1816 return True 1817 1818 self.bad = False 1819 verbose = self.op.verbose 1820 self._feedback_fn = feedback_fn 1821 1822 vg_name = self.cfg.GetVGName() 1823 drbd_helper = self.cfg.GetDRBDHelper() 1824 cluster = self.cfg.GetClusterInfo() 1825 hypervisors = cluster.enabled_hypervisors 1826 node_data_list = self.my_node_info.values() 1827 1828 i_non_redundant = [] # Non redundant instances 1829 i_non_a_balanced = [] # Non auto-balanced instances 1830 i_offline = 0 # Count of offline instances 1831 n_offline = 0 # Count of offline nodes 1832 n_drained = 0 # Count of nodes being drained 1833 node_vol_should = {} 1834 1835 # FIXME: verify OS list 1836 1837 # File verification 1838 filemap = ComputeAncillaryFiles(cluster, False) 1839 1840 # do local checksums 1841 master_node_uuid = self.master_node = self.cfg.GetMasterNode() 1842 master_ip = self.cfg.GetMasterIP() 1843 1844 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids)) 1845 1846 user_scripts = [] 1847 if self.cfg.GetUseExternalMipScript(): 1848 user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT) 1849 1850 node_verify_param = { 1851 constants.NV_FILELIST: 1852 map(vcluster.MakeVirtualPath, 1853 utils.UniqueSequence(filename 1854 for files in filemap 1855 for filename in files)), 1856 constants.NV_NODELIST: 1857 self._SelectSshCheckNodes(node_data_list, self.group_uuid, 1858 self.all_node_info.values()), 1859 constants.NV_HYPERVISOR: hypervisors, 1860 constants.NV_HVPARAMS: 1861 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()), 1862 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip) 1863 for node in node_data_list 1864 if not node.offline], 1865 constants.NV_INSTANCELIST: hypervisors, 1866 constants.NV_VERSION: None, 1867 constants.NV_HVINFO: self.cfg.GetHypervisorType(), 1868 constants.NV_NODESETUP: None, 1869 constants.NV_TIME: None, 1870 constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip), 1871 constants.NV_OSLIST: None, 1872 constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(), 1873 constants.NV_USERSCRIPTS: user_scripts, 1874 constants.NV_CLIENT_CERT: None, 1875 } 1876 1877 if self.cfg.GetClusterInfo().modify_ssh_setup: 1878 node_verify_param[constants.NV_SSH_SETUP] = self._PrepareSshSetupCheck() 1879 if self.op.verify_clutter: 1880 node_verify_param[constants.NV_SSH_CLUTTER] = True 1881 1882 if vg_name is not None: 1883 node_verify_param[constants.NV_VGLIST] = None 1884 node_verify_param[constants.NV_LVLIST] = vg_name 1885 node_verify_param[constants.NV_PVLIST] = [vg_name] 1886 1887 if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8): 1888 if drbd_helper: 1889 node_verify_param[constants.NV_DRBDVERSION] = None 1890 node_verify_param[constants.NV_DRBDLIST] = None 1891 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper 1892 1893 if cluster.IsFileStorageEnabled() or \ 1894 cluster.IsSharedFileStorageEnabled(): 1895 # Load file storage paths only from master node 1896 node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \ 1897 self.cfg.GetMasterNodeName() 1898 if cluster.IsFileStorageEnabled(): 1899 node_verify_param[constants.NV_FILE_STORAGE_PATH] = \ 1900 cluster.file_storage_dir 1901 if 
cluster.IsSharedFileStorageEnabled(): 1902 node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \ 1903 cluster.shared_file_storage_dir 1904 1905 # bridge checks 1906 # FIXME: this needs to be changed per node-group, not cluster-wide 1907 bridges = set() 1908 default_nicpp = cluster.nicparams[constants.PP_DEFAULT] 1909 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: 1910 bridges.add(default_nicpp[constants.NIC_LINK]) 1911 for inst_uuid in self.my_inst_info.values(): 1912 for nic in inst_uuid.nics: 1913 full_nic = cluster.SimpleFillNIC(nic.nicparams) 1914 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: 1915 bridges.add(full_nic[constants.NIC_LINK]) 1916 1917 if bridges: 1918 node_verify_param[constants.NV_BRIDGES] = list(bridges) 1919 1920 # Build our expected cluster state 1921 node_image = dict((node.uuid, self.NodeImage(offline=node.offline, 1922 uuid=node.uuid, 1923 vm_capable=node.vm_capable)) 1924 for node in node_data_list) 1925 1926 # Gather OOB paths 1927 oob_paths = [] 1928 for node in self.all_node_info.values(): 1929 path = SupportsOob(self.cfg, node) 1930 if path and path not in oob_paths: 1931 oob_paths.append(path) 1932 1933 if oob_paths: 1934 node_verify_param[constants.NV_OOB_PATHS] = oob_paths 1935 1936 for inst_uuid in self.my_inst_uuids: 1937 instance = self.my_inst_info[inst_uuid] 1938 if instance.admin_state == constants.ADMINST_OFFLINE: 1939 i_offline += 1 1940 1941 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid) 1942 for nuuid in inst_nodes: 1943 if nuuid not in node_image: 1944 gnode = self.NodeImage(uuid=nuuid) 1945 gnode.ghost = (nuuid not in self.all_node_info) 1946 node_image[nuuid] = gnode 1947 1948 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 1949 1950 pnode = instance.primary_node 1951 node_image[pnode].pinst.append(instance.uuid) 1952 1953 for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid): 1954 nimg = node_image[snode] 1955 nimg.sinst.append(instance.uuid) 1956 if pnode not in nimg.sbp: 1957 nimg.sbp[pnode] = [] 1958 nimg.sbp[pnode].append(instance.uuid) 1959 1960 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, 1961 self.my_node_info.keys()) 1962 # The value of exclusive_storage should be the same across the group, so if 1963 # it's True for at least a node, we act as if it were set for all the nodes 1964 self._exclusive_storage = compat.any(es_flags.values()) 1965 if self._exclusive_storage: 1966 node_verify_param[constants.NV_EXCLUSIVEPVS] = True 1967 1968 node_group_uuids = dict(map(lambda n: (n.name, n.group), 1969 self.cfg.GetAllNodesInfo().values())) 1970 groups_config = self.cfg.GetAllNodeGroupsInfoDict() 1971 1972 # At this point, we have the in-memory data structures complete, 1973 # except for the runtime information, which we'll gather next 1974 1975 # NOTE: Here we lock the configuration for the duration of RPC calls, 1976 # which means that the cluster configuration changes are blocked during 1977 # this period. 1978 # This is something that should be done only exceptionally and only for 1979 # justified cases! 1980 # In this case, we need the lock as we can only verify the integrity of 1981 # configuration files on MCs only if we know nobody else is modifying it. 1982 # FIXME: The check for integrity of config.data should be moved to 1983 # WConfD, which is the only one who can otherwise ensure nobody 1984 # will modify the configuration during the check. 
1985 with self.cfg.GetConfigManager(shared=True, forcelock=True): 1986 feedback_fn("* Gathering information about nodes (%s nodes)" % 1987 len(self.my_node_uuids)) 1988 # Force the configuration to be fully distributed before doing any tests 1989 self.cfg.FlushConfig() 1990 # Due to the way our RPC system works, exact response times cannot be 1991 # guaranteed (e.g. a broken node could run into a timeout). By keeping 1992 # the time before and after executing the request, we can at least have 1993 # a time window. 1994 nvinfo_starttime = time.time() 1995 # Get lock on the configuration so that nobody modifies it concurrently. 1996 # Otherwise it can be modified by other jobs, failing the consistency 1997 # test. 1998 # NOTE: This is an exceptional situation, we should otherwise avoid 1999 # locking the configuration for something but very fast, pure operations. 2000 cluster_name = self.cfg.GetClusterName() 2001 hvparams = self.cfg.GetClusterInfo().hvparams 2002 all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids, 2003 node_verify_param, 2004 cluster_name, 2005 hvparams, 2006 node_group_uuids, 2007 groups_config) 2008 nvinfo_endtime = time.time() 2009 2010 if self.extra_lv_nodes and vg_name is not None: 2011 feedback_fn("* Gathering information about extra nodes (%s nodes)" % 2012 len(self.extra_lv_nodes)) 2013 extra_lv_nvinfo = \ 2014 self.rpc.call_node_verify(self.extra_lv_nodes, 2015 {constants.NV_LVLIST: vg_name}, 2016 self.cfg.GetClusterName(), 2017 self.cfg.GetClusterInfo().hvparams, 2018 node_group_uuids, 2019 groups_config) 2020 else: 2021 extra_lv_nvinfo = {} 2022 2023 # If not all nodes are being checked, we need to make sure the master 2024 # node and a non-checked vm_capable node are in the list. 2025 absent_node_uuids = set(self.all_node_info).difference(self.my_node_info) 2026 if absent_node_uuids: 2027 vf_nvinfo = all_nvinfo.copy() 2028 vf_node_info = list(self.my_node_info.values()) 2029 additional_node_uuids = [] 2030 if master_node_uuid not in self.my_node_info: 2031 additional_node_uuids.append(master_node_uuid) 2032 vf_node_info.append(self.all_node_info[master_node_uuid]) 2033 # Add the first vm_capable node we find which is not included, 2034 # excluding the master node (which we already have) 2035 for node_uuid in absent_node_uuids: 2036 nodeinfo = self.all_node_info[node_uuid] 2037 if (nodeinfo.vm_capable and not nodeinfo.offline and 2038 node_uuid != master_node_uuid): 2039 additional_node_uuids.append(node_uuid) 2040 vf_node_info.append(self.all_node_info[node_uuid]) 2041 break 2042 key = constants.NV_FILELIST 2043 2044 feedback_fn("* Gathering information about the master node") 2045 vf_nvinfo.update(self.rpc.call_node_verify( 2046 additional_node_uuids, {key: node_verify_param[key]}, 2047 self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams, 2048 node_group_uuids, 2049 groups_config)) 2050 else: 2051 vf_nvinfo = all_nvinfo 2052 vf_node_info = self.my_node_info.values() 2053 2054 all_drbd_map = self.cfg.ComputeDRBDMap() 2055 2056 feedback_fn("* Gathering disk information (%s nodes)" % 2057 len(self.my_node_uuids)) 2058 instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image, 2059 self.my_inst_info) 2060 2061 feedback_fn("* Verifying configuration file consistency") 2062 2063 self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo) 2064 if self.cfg.GetClusterInfo().modify_ssh_setup: 2065 self._VerifySshSetup(self.my_node_info.values(), all_nvinfo) 2066 self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap) 
2067 2068 feedback_fn("* Verifying node status") 2069 2070 refos_img = None 2071 2072 for node_i in node_data_list: 2073 nimg = node_image[node_i.uuid] 2074 2075 if node_i.offline: 2076 if verbose: 2077 feedback_fn("* Skipping offline node %s" % (node_i.name,)) 2078 n_offline += 1 2079 continue 2080 2081 if node_i.uuid == master_node_uuid: 2082 ntype = "master" 2083 elif node_i.master_candidate: 2084 ntype = "master candidate" 2085 elif node_i.drained: 2086 ntype = "drained" 2087 n_drained += 1 2088 else: 2089 ntype = "regular" 2090 if verbose: 2091 feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype)) 2092 2093 msg = all_nvinfo[node_i.uuid].fail_msg 2094 self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name, 2095 "while contacting node: %s", msg) 2096 if msg: 2097 nimg.rpc_fail = True 2098 continue 2099 2100 nresult = all_nvinfo[node_i.uuid].payload 2101 2102 nimg.call_ok = self._VerifyNode(node_i, nresult) 2103 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime) 2104 self._VerifyNodeNetwork(node_i, nresult) 2105 self._VerifyNodeUserScripts(node_i, nresult) 2106 self._VerifyOob(node_i, nresult) 2107 self._VerifyAcceptedFileStoragePaths(node_i, nresult, 2108 node_i.uuid == master_node_uuid) 2109 self._VerifyFileStoragePaths(node_i, nresult) 2110 self._VerifySharedFileStoragePaths(node_i, nresult) 2111 self._VerifyGlusterStoragePaths(node_i, nresult) 2112 2113 if nimg.vm_capable: 2114 self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg) 2115 if constants.DT_DRBD8 in cluster.enabled_disk_templates: 2116 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, 2117 self.all_disks_info, drbd_helper, all_drbd_map) 2118 2119 if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \ 2120 (constants.DT_DRBD8 in cluster.enabled_disk_templates): 2121 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name) 2122 self._UpdateNodeInstances(node_i, nresult, nimg) 2123 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name) 2124 self._UpdateNodeOS(node_i, nresult, nimg) 2125 2126 if not nimg.os_fail: 2127 if refos_img is None: 2128 refos_img = nimg 2129 self._VerifyNodeOS(node_i, nimg, refos_img) 2130 self._VerifyNodeBridges(node_i, nresult, bridges) 2131 2132 # Check whether all running instances are primary for the node. (This 2133 # can no longer be done from _VerifyInstance below, since some of the 2134 # wrong instances could be from other node groups.) 
2135 non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst) 2136 2137 for inst_uuid in non_primary_inst_uuids: 2138 test = inst_uuid in self.all_inst_info 2139 self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, 2140 self.cfg.GetInstanceName(inst_uuid), 2141 "instance should not run on node %s", node_i.name) 2142 self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name, 2143 "node is running unknown instance %s", inst_uuid) 2144 2145 self._VerifyGroupDRBDVersion(all_nvinfo) 2146 self._VerifyGroupLVM(node_image, vg_name) 2147 2148 for node_uuid, result in extra_lv_nvinfo.items(): 2149 self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload, 2150 node_image[node_uuid], vg_name) 2151 2152 feedback_fn("* Verifying instance status") 2153 for inst_uuid in self.my_inst_uuids: 2154 instance = self.my_inst_info[inst_uuid] 2155 if verbose: 2156 feedback_fn("* Verifying instance %s" % instance.name) 2157 self._VerifyInstance(instance, node_image, instdisk[inst_uuid]) 2158 2159 # If the instance is not fully redundant we cannot survive losing its 2160 # primary node, so we are not N+1 compliant. 2161 inst_disks = self.cfg.GetInstanceDisks(instance.uuid) 2162 if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED): 2163 i_non_redundant.append(instance) 2164 2165 if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]: 2166 i_non_a_balanced.append(instance) 2167 2168 feedback_fn("* Verifying orphan volumes") 2169 reserved = utils.FieldSet(*cluster.reserved_lvs) 2170 2171 # We will get spurious "unknown volume" warnings if any node of this group 2172 # is secondary for an instance whose primary is in another group. To avoid 2173 # them, we find these instances and add their volumes to node_vol_should. 2174 for instance in self.all_inst_info.values(): 2175 for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid): 2176 if (secondary in self.my_node_info 2177 and instance.uuid not in self.my_inst_info): 2178 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 2179 break 2180 2181 self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved) 2182 2183 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks: 2184 feedback_fn("* Verifying N+1 Memory redundancy") 2185 self._VerifyNPlusOneMemory(node_image, self.my_inst_info) 2186 2187 self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced, 2188 i_offline, n_offline, n_drained) 2189 2190 return not self.bad
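Editor's note: a condensed sketch of the expected-state bookkeeping that Exec builds before contacting the nodes: the primary node's image lists the instance in pinst, and every secondary lists it both in sinst and in sbp (secondaries grouped by primary). The class and all identifiers below are simplified stand-ins, not the real NodeImage.

class NodeImageSketch(object):
    def __init__(self):
        self.pinst = []  # instances having this node as primary
        self.sinst = []  # instances having this node as secondary
        self.sbp = {}    # primary node uuid -> instances mirrored here

node_image = {"node-a": NodeImageSketch(), "node-b": NodeImageSketch()}
instances = [("inst-1", "node-a", ["node-b"])]  # (uuid, primary, secondaries)

for inst_uuid, pnode, snodes in instances:
    node_image[pnode].pinst.append(inst_uuid)
    for snode in snodes:
        node_image[snode].sinst.append(inst_uuid)
        node_image[snode].sbp.setdefault(pnode, []).append(inst_uuid)

print(node_image["node-b"].sbp)  # {'node-a': ['inst-1']}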
2191
2192 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2193 """Analyze the post-hooks' result 2194 2195 This method analyses the hook result, handles it, and sends some 2196 nicely-formatted feedback back to the user. 2197 2198 @param phase: one of L{constants.HOOKS_PHASE_POST} or 2199 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase 2200 @param hooks_results: the results of the multi-node hooks rpc call 2201 @param feedback_fn: function used send feedback back to the caller 2202 @param lu_result: previous Exec result 2203 @return: the new Exec result, based on the previous result 2204 and hook results 2205 2206 """ 2207 # We only really run POST phase hooks, only for non-empty groups, 2208 # and are only interested in their results 2209 if not self.my_node_uuids: 2210 # empty node group 2211 pass 2212 elif phase == constants.HOOKS_PHASE_POST: 2213 # Used to change hooks' output to proper indentation 2214 feedback_fn("* Hooks Results") 2215 assert hooks_results, "invalid result from hooks" 2216 2217 for node_name in hooks_results: 2218 res = hooks_results[node_name] 2219 msg = res.fail_msg 2220 test = msg and not res.offline 2221 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2222 "Communication failure in hooks execution: %s", msg) 2223 if test: 2224 lu_result = False 2225 continue 2226 if res.offline: 2227 # No need to investigate payload if node is offline 2228 continue 2229 for script, hkr, output in res.payload: 2230 test = hkr == constants.HKR_FAIL 2231 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2232 "Script %s failed, output:", script) 2233 if test: 2234 output = self._HOOKS_INDENT_RE.sub(" ", output) 2235 feedback_fn("%s" % output) 2236 lu_result = False 2237 2238 return lu_result
2239