Package ganeti :: Package cmdlib :: Package cluster :: Module verify
[hide private]
[frames] | [no frames]

Source Code for Module ganeti.cmdlib.cluster.verify

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30  """Logical units for cluster verification.""" 
  31   
  32  import itertools 
  33  import logging 
  34  import operator 
  35  import re 
  36  import time 
  37  import ganeti.masterd.instance 
  38  import ganeti.rpc.node as rpc 
  39   
  40  from ganeti import compat 
  41  from ganeti import constants 
  42  from ganeti import errors 
  43  from ganeti import locking 
  44  from ganeti import pathutils 
  45  from ganeti import utils 
  46  from ganeti import vcluster 
  47  from ganeti import hypervisor 
  48  from ganeti import opcodes 
  49   
  50  from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs 
  51  from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \ 
  52      CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \ 
  53      SupportsOob 
54 55 56 -def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters. 58 59 @type cluster: L{objects.Cluster} 60 @param cluster: the cluster object 61 @param instances: list of L{objects.Instance} 62 @param instances: additional instances from which to obtain parameters 63 @rtype: list of (origin, hypervisor, parameters) 64 @return: a list with all parameters found, indicating the hypervisor they 65 apply to, and the origin (can be "cluster", "os X", or "instance Y") 66 67 """ 68 hvp_data = [] 69 70 for hv_name in cluster.enabled_hypervisors: 71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) 72 73 for os_name, os_hvp in cluster.os_hvp.items(): 74 for hv_name, hv_params in os_hvp.items(): 75 if hv_params: 76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) 77 hvp_data.append(("os %s" % os_name, hv_name, full_params)) 78 79 # TODO: collapse identical parameter values in a single one 80 for instance in instances: 81 if instance.hvparams: 82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor, 83 cluster.FillHV(instance))) 84 85 return hvp_data
86
class _VerifyErrors(object):
  """Mix-in providing error reporting for cluster/group verify LUs.

  Supplies L{_Error} and L{_ErrorIf} and maintains the C{self.bad}
  boolean. Classes using this mix-in must provide C{self.op} and
  C{self._feedback_fn}.

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format and report a single verification problem.

    Depending on the opcode's C{error_codes} parameter, the message is
    emitted either as a machine-parseable code string or as a simpler
    human-readable line.

    This must be called only from Exec and functions called from Exec.

    """
    itype, etxt, _ = ecode
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    # Error codes the user asked to ignore are demoted to warnings
    if etxt in self.op.ignore_errors:  # pylint: disable=E1101
      ltype = self.ETYPE_WARNING
    # complete the message with its arguments first
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes:  # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      prefix = " " + item if item else ""
      msg = "%s: %s%s: %s" % (ltype, itype, prefix, msg)
    # finally report it via the feedback function
    self._feedback_fn(" - %s" % msg)  # Mix-in. pylint: disable=E1101
    # only genuine errors (not warnings) mark the operation as failed
    if ltype == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorIf(self, cond, *args, **kwargs):
    """Report an error when C{cond} is true (or when simulating errors).

    """
    if bool(cond) or self.op.debug_simulate_errors:  # pylint: disable=E1101
      self._Error(*args, **kwargs)
140
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # No locks needed: this LU only submits further jobs
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification jobs.

    @rtype: L{ResultWithJobs}
    @return: one job per node group (plus, for a whole-cluster run, a
        leading config-verification job the group jobs depend on)

    """
    jobs = []

    if self.op.group_name:
      # Single-group verification: no global config check, no dependencies
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
        ])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs):  # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        # only OpClusterVerifyGroup lacks skip_checks
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
186
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # Read-only operation: all locks are shared
    self.share_locks = ShareAll()
    if self.op.group_name:
      group_locks = [self.cfg.LookupNodeGroup(self.op.group_name)]
    else:
      group_locks = locking.ALL_SET
    self.needed_locks = {
      locking.LEVEL_NODEGROUP: group_locks,
      }

  def Exec(self, feedback_fn):
    """Submit one L{opcodes.OpGroupVerifyDisks} job per owned node group.

    """
    jobs = []
    for group in self.owned_locks(locking.LEVEL_NODEGROUP):
      jobs.append([opcodes.OpGroupVerifyDisks(group_name=group)])
    return ResultWithJobs(jobs)
211
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @param hvp_data: list of (origin, hypervisor, parameters) tuples, as
        returned by L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      # FIX: the arguments were swapped ((item, hv_name)); "hypervisor" is
      # hv_name and "source" is the origin, matching the node-side message
      # in _VerifyNode
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      # "except E as err": works on Python 2.6+ and Python 3, unlike the
      # old "except E, err" form
      except errors.GenericError as err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Shared locks on everything: we only read the configuration
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no error was reported (warnings do not count)

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    # Build the UUID set once instead of rebuilding a list per instance
    # (the old code was quadratic in the number of instances/nodes)
    dangling_node_uuids = set(node.uuid for node in dangling_nodes)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
313
314 315 -class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
316 """Verifies the status of a node group. 317 318 """ 319 HPATH = "cluster-verify" 320 HTYPE = constants.HTYPE_CLUSTER 321 REQ_BGL = False 322 323 _HOOKS_INDENT_RE = re.compile("^", re.M) 324
325 - class NodeImage(object):
326 """A class representing the logical and physical status of a node. 327 328 @type uuid: string 329 @ivar uuid: the node UUID to which this object refers 330 @ivar volumes: a structure as returned from 331 L{ganeti.backend.GetVolumeList} (runtime) 332 @ivar instances: a list of running instances (runtime) 333 @ivar pinst: list of configured primary instances (config) 334 @ivar sinst: list of configured secondary instances (config) 335 @ivar sbp: dictionary of {primary-node: list of instances} for all 336 instances for which this node is secondary (config) 337 @ivar mfree: free memory, as reported by hypervisor (runtime) 338 @ivar dfree: free disk, as reported by the node (runtime) 339 @ivar offline: the offline status (config) 340 @type rpc_fail: boolean 341 @ivar rpc_fail: whether the RPC verify call was successfull (overall, 342 not whether the individual keys were correct) (runtime) 343 @type lvm_fail: boolean 344 @ivar lvm_fail: whether the RPC call didn't return valid LVM data 345 @type hyp_fail: boolean 346 @ivar hyp_fail: whether the RPC call didn't return the instance list 347 @type ghost: boolean 348 @ivar ghost: whether this is a known node or not (config) 349 @type os_fail: boolean 350 @ivar os_fail: whether the RPC call didn't return valid OS data 351 @type oslist: list 352 @ivar oslist: list of OSes as diagnosed by DiagnoseOS 353 @type vm_capable: boolean 354 @ivar vm_capable: whether the node can host instances 355 @type pv_min: float 356 @ivar pv_min: size in MiB of the smallest PVs 357 @type pv_max: float 358 @ivar pv_max: size in MiB of the biggest PVs 359 360 """
361 - def __init__(self, offline=False, uuid=None, vm_capable=True):
362 self.uuid = uuid 363 self.volumes = {} 364 self.instances = [] 365 self.pinst = [] 366 self.sinst = [] 367 self.sbp = {} 368 self.mfree = 0 369 self.dfree = 0 370 self.offline = offline 371 self.vm_capable = vm_capable 372 self.rpc_fail = False 373 self.lvm_fail = False 374 self.hyp_fail = False 375 self.ghost = False 376 self.os_fail = False 377 self.oslist = {} 378 self.pv_min = None 379 self.pv_max = None
380
381 - def ExpandNames(self):
382 # This raises errors.OpPrereqError on its own: 383 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name) 384 385 # Get instances in node group; this is unsafe and needs verification later 386 inst_uuids = \ 387 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 388 389 self.needed_locks = { 390 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids), 391 locking.LEVEL_NODEGROUP: [self.group_uuid], 392 locking.LEVEL_NODE: [], 393 } 394 395 self.share_locks = ShareAll()
396
397 - def DeclareLocks(self, level):
398 if level == locking.LEVEL_NODE: 399 # Get members of node group; this is unsafe and needs verification later 400 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members) 401 402 # In Exec(), we warn about mirrored instances that have primary and 403 # secondary living in separate node groups. To fully verify that 404 # volumes for these instances are healthy, we will need to do an 405 # extra call to their secondaries. We ensure here those nodes will 406 # be locked. 407 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE): 408 # Important: access only the instances whose lock is owned 409 instance = self.cfg.GetInstanceInfoByName(inst_name) 410 disks = self.cfg.GetInstanceDisks(instance.uuid) 411 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 412 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid)) 413 414 self.needed_locks[locking.LEVEL_NODE] = nodes
415
  def CheckPrereq(self):
    """Check prerequisites: lock coverage, and cache configuration data.

    @raise errors.OpPrereqError: if nodes, instances, or the extra nodes
        needed for split-LV checks are not locked

    """
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    group_node_uuids = set(self.group_info.members)
    group_inst_uuids = \
      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

    # Nodes/instances that joined the group after ExpandNames ran are not
    # covered by the locks we acquired; refuse to continue in that case
    unlocked_node_uuids = \
      group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

    unlocked_inst_uuids = \
      group_inst_uuids.difference(
        [self.cfg.GetInstanceInfoByName(name).uuid
         for name in self.owned_locks(locking.LEVEL_INSTANCE)])

    if unlocked_node_uuids:
      raise errors.OpPrereqError(
        "Missing lock for nodes: %s" %
        utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
        errors.ECODE_STATE)

    if unlocked_inst_uuids:
      raise errors.OpPrereqError(
        "Missing lock for instances: %s" %
        utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
        errors.ECODE_STATE)

    # Cache full config data for use by Exec and the helper methods
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.all_disks_info = self.cfg.GetAllDisksInfo()

    self.my_node_uuids = group_node_uuids
    self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                             for node_uuid in group_node_uuids)

    self.my_inst_uuids = group_inst_uuids
    self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                             for inst_uuid in group_inst_uuids)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = set()

    for inst in self.my_inst_info.values():
      disks = self.cfg.GetInstanceDisks(inst.uuid)
      if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
        for nuuid in inst_nodes:
          if self.all_node_info[nuuid].group != self.group_uuid:
            extra_lv_nodes.add(nuuid)

    unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                                 utils.CommaJoin(unlocked_lv_nodes),
                                 errors.ECODE_STATE)
    self.extra_lv_nodes = list(extra_lv_nodes)
476
477 - def _VerifyNode(self, ninfo, nresult):
478 """Perform some basic validation on data returned from a node. 479 480 - check the result data structure is well formed and has all the 481 mandatory fields 482 - check ganeti version 483 484 @type ninfo: L{objects.Node} 485 @param ninfo: the node to check 486 @param nresult: the results from the node 487 @rtype: boolean 488 @return: whether overall this call was successful (and we can expect 489 reasonable values in the respose) 490 491 """ 492 # main result, nresult should be a non-empty dict 493 test = not nresult or not isinstance(nresult, dict) 494 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 495 "unable to verify node: no data returned") 496 if test: 497 return False 498 499 # compares ganeti version 500 local_version = constants.PROTOCOL_VERSION 501 remote_version = nresult.get("version", None) 502 test = not (remote_version and 503 isinstance(remote_version, (list, tuple)) and 504 len(remote_version) == 2) 505 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name, 506 "connection to node returned invalid data") 507 if test: 508 return False 509 510 test = local_version != remote_version[0] 511 self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name, 512 "incompatible protocol versions: master %s," 513 " node %s", local_version, remote_version[0]) 514 if test: 515 return False 516 517 # node seems compatible, we can actually try to look into its results 518 519 # full package version 520 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1], 521 constants.CV_ENODEVERSION, ninfo.name, 522 "software version mismatch: master %s, node %s", 523 constants.RELEASE_VERSION, remote_version[1], 524 code=self.ETYPE_WARNING) 525 526 hyp_result = nresult.get(constants.NV_HYPERVISOR, None) 527 if ninfo.vm_capable and isinstance(hyp_result, dict): 528 for hv_name, hv_result in hyp_result.iteritems(): 529 test = hv_result is not None 530 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 531 "hypervisor %s verify failure: '%s'", hv_name, hv_result) 
532 533 hvp_result = nresult.get(constants.NV_HVPARAMS, None) 534 if ninfo.vm_capable and isinstance(hvp_result, list): 535 for item, hv_name, hv_result in hvp_result: 536 self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name, 537 "hypervisor %s parameter verify failure (source %s): %s", 538 hv_name, item, hv_result) 539 540 test = nresult.get(constants.NV_NODESETUP, 541 ["Missing NODESETUP results"]) 542 self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name, 543 "node setup error: %s", "; ".join(test)) 544 545 return True
546
547 - def _VerifyNodeTime(self, ninfo, nresult, 548 nvinfo_starttime, nvinfo_endtime):
549 """Check the node time. 550 551 @type ninfo: L{objects.Node} 552 @param ninfo: the node to check 553 @param nresult: the remote results for the node 554 @param nvinfo_starttime: the start time of the RPC call 555 @param nvinfo_endtime: the end time of the RPC call 556 557 """ 558 ntime = nresult.get(constants.NV_TIME, None) 559 try: 560 ntime_merged = utils.MergeTime(ntime) 561 except (ValueError, TypeError): 562 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name, 563 "Node returned invalid time") 564 return 565 566 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): 567 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged) 568 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW): 569 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime) 570 else: 571 ntime_diff = None 572 573 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name, 574 "Node time diverges by at least %s from master node time", 575 ntime_diff)
576
577 - def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
578 """Check the node LVM results and update info for cross-node checks. 579 580 @type ninfo: L{objects.Node} 581 @param ninfo: the node to check 582 @param nresult: the remote results for the node 583 @param vg_name: the configured VG name 584 @type nimg: L{NodeImage} 585 @param nimg: node image 586 587 """ 588 if vg_name is None: 589 return 590 591 # checks vg existence and size > 20G 592 vglist = nresult.get(constants.NV_VGLIST, None) 593 test = not vglist 594 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 595 "unable to check volume groups") 596 if not test: 597 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name, 598 constants.MIN_VG_SIZE) 599 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus) 600 601 # Check PVs 602 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage) 603 for em in errmsgs: 604 self._Error(constants.CV_ENODELVM, ninfo.name, em) 605 if pvminmax is not None: 606 (nimg.pv_min, nimg.pv_max) = pvminmax
607
608 - def _VerifyGroupDRBDVersion(self, node_verify_infos):
609 """Check cross-node DRBD version consistency. 610 611 @type node_verify_infos: dict 612 @param node_verify_infos: infos about nodes as returned from the 613 node_verify call. 614 615 """ 616 node_versions = {} 617 for node_uuid, ndata in node_verify_infos.items(): 618 nresult = ndata.payload 619 if nresult: 620 version = nresult.get(constants.NV_DRBDVERSION, None) 621 if version: 622 node_versions[node_uuid] = version 623 624 if len(set(node_versions.values())) > 1: 625 for node_uuid, version in sorted(node_versions.items()): 626 msg = "DRBD version mismatch: %s" % version 627 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg, 628 code=self.ETYPE_WARNING)
629
630 - def _VerifyGroupLVM(self, node_image, vg_name):
631 """Check cross-node consistency in LVM. 632 633 @type node_image: dict 634 @param node_image: info about nodes, mapping from node to names to 635 L{NodeImage} objects 636 @param vg_name: the configured VG name 637 638 """ 639 if vg_name is None: 640 return 641 642 # Only exclusive storage needs this kind of checks 643 if not self._exclusive_storage: 644 return 645 646 # exclusive_storage wants all PVs to have the same size (approximately), 647 # if the smallest and the biggest ones are okay, everything is fine. 648 # pv_min is None iff pv_max is None 649 vals = filter((lambda ni: ni.pv_min is not None), node_image.values()) 650 if not vals: 651 return 652 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals) 653 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals) 654 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax) 655 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name, 656 "PV sizes differ too much in the group; smallest (%s MB) is" 657 " on %s, biggest (%s MB) is on %s", 658 pvmin, self.cfg.GetNodeName(minnode_uuid), 659 pvmax, self.cfg.GetNodeName(maxnode_uuid))
660
661 - def _VerifyNodeBridges(self, ninfo, nresult, bridges):
662 """Check the node bridges. 663 664 @type ninfo: L{objects.Node} 665 @param ninfo: the node to check 666 @param nresult: the remote results for the node 667 @param bridges: the expected list of bridges 668 669 """ 670 if not bridges: 671 return 672 673 missing = nresult.get(constants.NV_BRIDGES, None) 674 test = not isinstance(missing, list) 675 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 676 "did not return valid bridge information") 677 if not test: 678 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name, 679 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
680
681 - def _VerifyNodeUserScripts(self, ninfo, nresult):
682 """Check the results of user scripts presence and executability on the node 683 684 @type ninfo: L{objects.Node} 685 @param ninfo: the node to check 686 @param nresult: the remote results for the node 687 688 """ 689 test = not constants.NV_USERSCRIPTS in nresult 690 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 691 "did not return user scripts information") 692 693 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None) 694 if not test: 695 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 696 "user scripts not present or not executable: %s" % 697 utils.CommaJoin(sorted(broken_scripts)))
698
699 - def _VerifyNodeNetwork(self, ninfo, nresult):
700 """Check the node network connectivity results. 701 702 @type ninfo: L{objects.Node} 703 @param ninfo: the node to check 704 @param nresult: the remote results for the node 705 706 """ 707 test = constants.NV_NODELIST not in nresult 708 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name, 709 "node hasn't returned node ssh connectivity data") 710 if not test: 711 if nresult[constants.NV_NODELIST]: 712 for a_node, a_msg in nresult[constants.NV_NODELIST].items(): 713 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name, 714 "ssh communication with node '%s': %s", a_node, a_msg) 715 716 test = constants.NV_NODENETTEST not in nresult 717 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 718 "node hasn't returned node tcp connectivity data") 719 if not test: 720 if nresult[constants.NV_NODENETTEST]: 721 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) 722 for anode in nlist: 723 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, 724 "tcp communication with node '%s': %s", 725 anode, nresult[constants.NV_NODENETTEST][anode]) 726 727 test = constants.NV_MASTERIP not in nresult 728 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 729 "node hasn't returned node master IP reachability data") 730 if not test: 731 if not nresult[constants.NV_MASTERIP]: 732 if ninfo.uuid == self.master_node: 733 msg = "the master node cannot reach the master IP (not configured?)" 734 else: 735 msg = "cannot reach the master IP" 736 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
737
738 - def _VerifyInstance(self, instance, node_image, diskstatus):
739 """Verify an instance. 740 741 This function checks to see if the required block devices are 742 available on the instance's node, and that the nodes are in the correct 743 state. 744 745 """ 746 pnode_uuid = instance.primary_node 747 pnode_img = node_image[pnode_uuid] 748 groupinfo = self.cfg.GetAllNodeGroupsInfo() 749 750 node_vol_should = {} 751 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should) 752 753 cluster = self.cfg.GetClusterInfo() 754 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster, 755 self.group_info) 756 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg) 757 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name, 758 utils.CommaJoin(err), code=self.ETYPE_WARNING) 759 760 for node_uuid in node_vol_should: 761 n_img = node_image[node_uuid] 762 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail: 763 # ignore missing volumes on offline or broken nodes 764 continue 765 for volume in node_vol_should[node_uuid]: 766 test = volume not in n_img.volumes 767 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name, 768 "volume %s missing on node %s", volume, 769 self.cfg.GetNodeName(node_uuid)) 770 771 if instance.admin_state == constants.ADMINST_UP: 772 test = instance.uuid not in pnode_img.instances and not pnode_img.offline 773 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name, 774 "instance not running on its primary node %s", 775 self.cfg.GetNodeName(pnode_uuid)) 776 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE, 777 instance.name, "instance is marked as running and lives on" 778 " offline node %s", self.cfg.GetNodeName(pnode_uuid)) 779 780 diskdata = [(nname, success, status, idx) 781 for (nname, disks) in diskstatus.items() 782 for idx, (success, status) in enumerate(disks)] 783 784 for nname, success, bdev_status, idx in diskdata: 785 # the 'ghost node' construction in Exec() ensures that we have a 786 # node here 787 snode = node_image[nname] 788 
bad_snode = snode.ghost or snode.offline 789 self._ErrorIf(instance.disks_active and 790 not success and not bad_snode, 791 constants.CV_EINSTANCEFAULTYDISK, instance.name, 792 "couldn't retrieve status for disk/%s on %s: %s", 793 idx, self.cfg.GetNodeName(nname), bdev_status) 794 795 if instance.disks_active and success and bdev_status.is_degraded: 796 msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname)) 797 798 code = self.ETYPE_ERROR 799 accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC] 800 801 if bdev_status.ldisk_status in accepted_lds: 802 code = self.ETYPE_WARNING 803 804 msg += "; local disk state is '%s'" % \ 805 constants.LDS_NAMES[bdev_status.ldisk_status] 806 807 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg, 808 code=code) 809 810 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline, 811 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid), 812 "instance %s, connection to primary node failed", 813 instance.name) 814 815 secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid) 816 self._ErrorIf(len(secondary_nodes) > 1, 817 constants.CV_EINSTANCELAYOUT, instance.name, 818 "instance has multiple secondary nodes: %s", 819 utils.CommaJoin(secondary_nodes), 820 code=self.ETYPE_WARNING) 821 822 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid) 823 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes) 824 disks = self.cfg.GetInstanceDisks(instance.uuid) 825 if any(es_flags.values()): 826 if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE): 827 # Disk template not compatible with exclusive_storage: no instance 828 # node should have the flag set 829 es_nodes = [n 830 for (n, es) in es_flags.items() 831 if es] 832 unsupported = [d.dev_type for d in disks 833 if d.dev_type not in constants.DTS_EXCL_STORAGE] 834 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name, 835 "instance uses disk types %s, which are not supported on" 836 " nodes that have exclusive storage 
set: %s", 837 utils.CommaJoin(unsupported), 838 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes))) 839 for (idx, disk) in enumerate(disks): 840 self._ErrorIf(disk.spindles is None, 841 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name, 842 "number of spindles not configured for disk %s while" 843 " exclusive storage is enabled, try running" 844 " gnt-cluster repair-disk-sizes", idx) 845 846 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 847 instance_nodes = utils.NiceSort(inst_nodes) 848 instance_groups = {} 849 850 for node_uuid in instance_nodes: 851 instance_groups.setdefault(self.all_node_info[node_uuid].group, 852 []).append(node_uuid) 853 854 pretty_list = [ 855 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)), 856 groupinfo[group].name) 857 # Sort so that we always list the primary node first. 858 for group, nodes in sorted(instance_groups.items(), 859 key=lambda (_, nodes): pnode_uuid in nodes, 860 reverse=True)] 861 862 self._ErrorIf(len(instance_groups) > 1, 863 constants.CV_EINSTANCESPLITGROUPS, 864 instance.name, "instance has primary and secondary nodes in" 865 " different groups: %s", utils.CommaJoin(pretty_list), 866 code=self.ETYPE_WARNING) 867 868 inst_nodes_offline = [] 869 for snode in secondary_nodes: 870 s_img = node_image[snode] 871 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC, 872 self.cfg.GetNodeName(snode), 873 "instance %s, connection to secondary node failed", 874 instance.name) 875 876 if s_img.offline: 877 inst_nodes_offline.append(snode) 878 879 # warn that the instance lives on offline nodes 880 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, 881 instance.name, "instance has offline secondary node(s) %s", 882 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline))) 883 # ... 
or ghost/non-vm_capable nodes 884 for node_uuid in inst_nodes: 885 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE, 886 instance.name, "instance lives on ghost node %s", 887 self.cfg.GetNodeName(node_uuid)) 888 self._ErrorIf(not node_image[node_uuid].vm_capable, 889 constants.CV_EINSTANCEBADNODE, instance.name, 890 "instance lives on non-vm_capable node %s", 891 self.cfg.GetNodeName(node_uuid))
892
893 - def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image, 894 reserved):
895 """Verify if there are any unknown volumes in the cluster. 896 897 The .os, .swap and backup volumes are ignored. All other volumes are 898 reported as unknown. 899 900 @type vg_name: string 901 @param vg_name: the name of the Ganeti-administered volume group 902 @type reserved: L{ganeti.utils.FieldSet} 903 @param reserved: a FieldSet of reserved volume names 904 905 """ 906 for node_uuid, n_img in node_image.items(): 907 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or 908 self.all_node_info[node_uuid].group != self.group_uuid): 909 # skip non-healthy nodes 910 continue 911 for volume in n_img.volumes: 912 # skip volumes not belonging to the ganeti-administered volume group 913 if volume.split('/')[0] != vg_name: 914 continue 915 916 test = ((node_uuid not in node_vol_should or 917 volume not in node_vol_should[node_uuid]) and 918 not reserved.Matches(volume)) 919 self._ErrorIf(test, constants.CV_ENODEORPHANLV, 920 self.cfg.GetNodeName(node_uuid), 921 "volume %s is unknown", volume, 922 code=_VerifyErrors.ETYPE_WARNING)
923
924 - def _VerifyNPlusOneMemory(self, node_image, all_insts):
925 """Verify N+1 Memory Resilience. 926 927 Check that if one single node dies we can still start all the 928 instances it was primary for. 929 930 """ 931 cluster_info = self.cfg.GetClusterInfo() 932 for node_uuid, n_img in node_image.items(): 933 # This code checks that every node which is now listed as 934 # secondary has enough memory to host all instances it is 935 # supposed to should a single other node in the cluster fail. 936 # FIXME: not ready for failover to an arbitrary node 937 # FIXME: does not support file-backed instances 938 # WARNING: we currently take into account down instances as well 939 # as up ones, considering that even if they're down someone 940 # might want to start them even in the event of a node failure. 941 if n_img.offline or \ 942 self.all_node_info[node_uuid].group != self.group_uuid: 943 # we're skipping nodes marked offline and nodes in other groups from 944 # the N+1 warning, since most likely we don't have good memory 945 # information from them; we already list instances living on such 946 # nodes, and that's enough warning 947 continue 948 #TODO(dynmem): also consider ballooning out other instances 949 for prinode, inst_uuids in n_img.sbp.items(): 950 needed_mem = 0 951 for inst_uuid in inst_uuids: 952 bep = cluster_info.FillBE(all_insts[inst_uuid]) 953 if bep[constants.BE_AUTO_BALANCE]: 954 needed_mem += bep[constants.BE_MINMEM] 955 test = n_img.mfree < needed_mem 956 self._ErrorIf(test, constants.CV_ENODEN1, 957 self.cfg.GetNodeName(node_uuid), 958 "not enough memory to accomodate instance failovers" 959 " should node %s fail (%dMiB needed, %dMiB available)", 960 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
961
  def _VerifyClientCertificates(self, nodes, all_nvinfo):
    """Verifies the consistency of the client certificates.

    This includes several aspects:
      - the individual validation of all nodes' certificates
      - the consistency of the master candidate certificate map
      - the consistency of the master candidate certificate map with the
        certificates that the master candidates are actually using.

    @param nodes: the list of nodes to consider in this verification
    @param all_nvinfo: the map of results of the verify_node call to
      all nodes

    """
    candidate_certs = self.cfg.GetClusterInfo().candidate_certs
    if candidate_certs is None or len(candidate_certs) == 0:
      # An empty map means the cluster crypto was never initialized for
      # per-node certificates; nothing else can be checked in that case.
      self._ErrorIf(
        True, constants.CV_ECLUSTERCLIENTCERT, None,
        "The cluster's list of master candidate certificates is empty."
        " If you just updated the cluster, please run"
        " 'gnt-cluster renew-crypto --new-node-certificates'.")
      return

    # Digests must be unique across master candidates, otherwise two
    # nodes would be indistinguishable by certificate.
    self._ErrorIf(
      len(candidate_certs) != len(set(candidate_certs.values())),
      constants.CV_ECLUSTERCLIENTCERT, None,
      "There are at least two master candidates configured to use the same"
      " certificate.")

    # collect the client certificate
    for node in nodes:
      if node.offline:
        continue

      nresult = all_nvinfo[node.uuid]
      if nresult.fail_msg or not nresult.payload:
        # RPC failures are reported by other checks; skip silently here
        continue

      # NOTE(review): if NV_CLIENT_CERT is missing from the payload, the
      # .get() default of None would make this tuple unpack raise
      # TypeError -- presumably verify_node always includes the key when
      # the payload is non-empty; confirm against the node verify RPC.
      (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)

      self._ErrorIf(
        errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
        "Client certificate of node '%s' failed validation: %s (code '%s')",
        node.uuid, msg, errcode)

      if not errcode:
        # on success, 'msg' carries the certificate digest
        digest = msg
        if node.master_candidate:
          if node.uuid in candidate_certs:
            # mapped candidate: its live digest must match the map entry
            self._ErrorIf(
              digest != candidate_certs[node.uuid],
              constants.CV_ECLUSTERCLIENTCERT, None,
              "Client certificate digest of master candidate '%s' does not"
              " match its entry in the cluster's map of master candidate"
              " certificates. Expected: %s Got: %s", node.uuid,
              digest, candidate_certs[node.uuid])
          else:
            # candidate missing from the map entirely
            self._ErrorIf(
              True, constants.CV_ECLUSTERCLIENTCERT, None,
              "The master candidate '%s' does not have an entry in the"
              " map of candidate certificates.", node.uuid)
            self._ErrorIf(
              digest in candidate_certs.values(),
              constants.CV_ECLUSTERCLIENTCERT, None,
              "Master candidate '%s' is using a certificate of another node.",
              node.uuid)
        else:
          # non-candidates must not appear in the candidate map ...
          self._ErrorIf(
            node.uuid in candidate_certs,
            constants.CV_ECLUSTERCLIENTCERT, None,
            "Node '%s' is not a master candidate, but still listed in the"
            " map of master candidate certificates.", node.uuid)
          # ... nor reuse a certificate that belongs to a candidate
          self._ErrorIf(
            (node.uuid not in candidate_certs) and
            (digest in candidate_certs.values()),
            constants.CV_ECLUSTERCLIENTCERT, None,
            "Node '%s' is not a master candidate and is incorrectly using a"
            " certificate of another node which is master candidate.",
            node.uuid)
1041
1042 - def _VerifySshSetup(self, nodes, all_nvinfo):
1043 """Evaluates the verification results of the SSH setup and clutter test. 1044 1045 @param nodes: List of L{objects.Node} objects 1046 @param all_nvinfo: RPC results 1047 1048 """ 1049 for node in nodes: 1050 if not node.offline: 1051 nresult = all_nvinfo[node.uuid] 1052 if nresult.fail_msg or not nresult.payload: 1053 self._ErrorIf(True, constants.CV_ENODESSH, node.name, 1054 "Could not verify the SSH setup of this node.") 1055 return 1056 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]: 1057 result = nresult.payload.get(ssh_test, None) 1058 error_msg = "" 1059 if isinstance(result, list): 1060 error_msg = " ".join(result) 1061 self._ErrorIf(result, 1062 constants.CV_ENODESSH, None, error_msg)
1063
  def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                   (files_all, files_opt, files_mc, files_vm)):
    """Verifies file checksums collected from all nodes.

    @param nodes: List of L{objects.Node} objects
    @param master_node_uuid: UUID of master node
    @param all_nvinfo: RPC results

    """
    # NOTE: the fourth parameter uses Python-2-only tuple unpacking in the
    # signature; files_opt is only consulted below, the other three sets
    # decide which nodes must hold which files.
    # Define functions determining which nodes to consider for a file
    files2nodefn = [
      (files_all, None),
      (files_mc, lambda node: (node.master_candidate or
                               node.uuid == master_node_uuid)),
      (files_vm, lambda node: node.vm_capable),
      ]

    # Build mapping from filename to list of nodes which should have the file
    nodefiles = {}
    for (files, fn) in files2nodefn:
      if fn is None:
        filenodes = nodes
      else:
        filenodes = filter(fn, nodes)
      nodefiles.update((filename,
                        frozenset(map(operator.attrgetter("uuid"), filenodes)))
                       for filename in files)

    assert set(nodefiles) == (files_all | files_mc | files_vm)

    # filename -> {checksum -> set of node uuids having that checksum}
    fileinfo = dict((filename, {}) for filename in nodefiles)
    ignore_nodes = set()

    for node in nodes:
      if node.offline:
        # offline nodes are excluded from all per-file accounting
        ignore_nodes.add(node.uuid)
        continue

      nresult = all_nvinfo[node.uuid]

      if nresult.fail_msg or not nresult.payload:
        node_files = None
      else:
        # node-side paths are virtual cluster paths; translate them back
        fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
        node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                          for (key, value) in fingerprints.items())
        del fingerprints

      test = not (node_files and isinstance(node_files, dict))
      self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                    "Node did not return file checksum data")
      if test:
        ignore_nodes.add(node.uuid)
        continue

      # Build per-checksum mapping from filename to nodes having it
      for (filename, checksum) in node_files.items():
        assert filename in nodefiles
        fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

    for (filename, checksums) in fileinfo.items():
      assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

      # Nodes having the file
      with_file = frozenset(node_uuid
                            for node_uuids in fileinfo[filename].values()
                            for node_uuid in node_uuids) - ignore_nodes

      expected_nodes = nodefiles[filename] - ignore_nodes

      # Nodes missing file
      missing_file = expected_nodes - with_file

      if filename in files_opt:
        # All or no nodes
        self._ErrorIf(missing_file and missing_file != expected_nodes,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is optional, but it must exist on all or no"
                      " nodes (not found on %s)",
                      filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))
      else:
        self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is missing from node(s) %s", filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))

        # Warn if a node has a file it shouldn't
        unexpected = with_file - expected_nodes
        self._ErrorIf(unexpected,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s should not exist on node(s) %s",
                      filename, utils.CommaJoin(
                        utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))

      # See if there are multiple versions of the file
      test = len(checksums) > 1
      if test:
        variants = ["variant %s on %s" %
                    (idx + 1,
                     utils.CommaJoin(utils.NiceSort(
                       map(self.cfg.GetNodeName, node_uuids))))
                    for (idx, (checksum, node_uuids)) in
                    enumerate(sorted(checksums.items()))]
      else:
        variants = []

      self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s found with %s different checksums (%s)",
                    filename, len(checksums), "; ".join(variants))
1177
1178 - def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1179 """Verify the drbd helper. 1180 1181 """ 1182 if drbd_helper: 1183 helper_result = nresult.get(constants.NV_DRBDHELPER, None) 1184 test = (helper_result is None) 1185 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1186 "no drbd usermode helper returned") 1187 if helper_result: 1188 status, payload = helper_result 1189 test = not status 1190 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1191 "drbd usermode helper check unsuccessful: %s", payload) 1192 test = status and (payload != drbd_helper) 1193 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1194 "wrong drbd usermode helper: %s", payload)
1195 1196 @staticmethod
1197 - def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1198 """Gives the DRBD information in a map for a node. 1199 1200 @type ninfo: L{objects.Node} 1201 @param ninfo: the node to check 1202 @param instanceinfo: the dict of instances 1203 @param disks_info: the dict of disks 1204 @param drbd_map: the DRBD map as returned by 1205 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1206 @type error_if: callable like L{_ErrorIf} 1207 @param error_if: The error reporting function 1208 @return: dict from minor number to (disk_uuid, instance_uuid, active) 1209 1210 """ 1211 node_drbd = {} 1212 for minor, disk_uuid in drbd_map[ninfo.uuid].items(): 1213 test = disk_uuid not in disks_info 1214 error_if(test, constants.CV_ECLUSTERCFG, None, 1215 "ghost disk '%s' in temporary DRBD map", disk_uuid) 1216 # ghost disk should not be active, but otherwise we 1217 # don't give double warnings (both ghost disk and 1218 # unallocated minor in use) 1219 if test: 1220 node_drbd[minor] = (disk_uuid, None, False) 1221 else: 1222 disk_active = False 1223 disk_instance = None 1224 for (inst_uuid, inst) in instanceinfo.items(): 1225 if disk_uuid in inst.disks: 1226 disk_active = inst.disks_active 1227 disk_instance = inst_uuid 1228 break 1229 node_drbd[minor] = (disk_uuid, disk_instance, disk_active) 1230 return node_drbd
1231
  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
                      drbd_helper, drbd_map):
    """Verifies the node's DRBD status.

    Checks the usermode helper, then cross-checks the minors allocated in
    the configuration against the minors actually in use on the node.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param instanceinfo: the dict of instances
    @param disks_info: the dict of disks
    @param drbd_helper: the configured DRBD usermode helper
    @param drbd_map: the DRBD map as returned by
      L{ganeti.config.ConfigWriter.ComputeDRBDMap}

    """
    self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)

    # compute the DRBD minors
    node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
                                        drbd_map, self._ErrorIf)

    # and now check them
    used_minors = nresult.get(constants.NV_DRBDLIST, [])
    test = not isinstance(used_minors, (tuple, list))
    self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
                  "cannot parse drbd status file: %s", str(used_minors))
    if test:
      # we cannot check drbd status
      return

    # allocated minors that should be active must show up in the node's list
    for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items():
      test = minor not in used_minors and must_exist
      if inst_uuid is not None:
        attached = "(attached in instance '%s')" % \
          self.cfg.GetInstanceName(inst_uuid)
      else:
        attached = "(detached)"
      self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
                    "drbd minor %d of disk %s %s is not active",
                    minor, disk_uuid, attached)
    # conversely, every in-use minor must be known to the configuration
    for minor in used_minors:
      test = minor not in node_drbd
      self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
                    "unallocated drbd minor %d is in use", minor)
1275
1276 - def _UpdateNodeOS(self, ninfo, nresult, nimg):
1277 """Builds the node OS structures. 1278 1279 @type ninfo: L{objects.Node} 1280 @param ninfo: the node to check 1281 @param nresult: the remote results for the node 1282 @param nimg: the node image object 1283 1284 """ 1285 remote_os = nresult.get(constants.NV_OSLIST, None) 1286 test = (not isinstance(remote_os, list) or 1287 not compat.all(isinstance(v, list) and len(v) == 8 1288 for v in remote_os)) 1289 1290 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 1291 "node hasn't returned valid OS data") 1292 1293 nimg.os_fail = test 1294 1295 if test: 1296 return 1297 1298 os_dict = {} 1299 1300 for (name, os_path, status, diagnose, 1301 variants, parameters, api_ver, 1302 trusted) in nresult[constants.NV_OSLIST]: 1303 1304 if name not in os_dict: 1305 os_dict[name] = [] 1306 1307 # parameters is a list of lists instead of list of tuples due to 1308 # JSON lacking a real tuple type, fix it: 1309 parameters = [tuple(v) for v in parameters] 1310 os_dict[name].append((os_path, status, diagnose, 1311 set(variants), set(parameters), set(api_ver), 1312 trusted)) 1313 1314 nimg.oslist = os_dict
1315
  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    Compares every OS reported by the node against the same OS on the
    reference node image and reports any differences.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    # render a (key, value) parameter list as "key: value" strings
    beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      # only the first entry is effective; later ones are shadowed
      f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
      self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                    "Invalid OS %s (located at %s): %s",
                    os_name, f_path, f_diag)
      self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                    "OS '%s' has multiple entries"
                    " (first one shadows the rest): %s",
                    os_name, utils.CommaJoin([v[0] for v in os_data]))
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
                    "Extra OS %s not present on reference node (%s)",
                    os_name, self.cfg.GetNodeName(base.uuid))
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      # set-valued fields are compared (and reported) as sorted lists
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", beautify_params(f_param),
                          beautify_params(b_param))]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " [%s] vs. [%s]", kind, os_name,
                      self.cfg.GetNodeName(base.uuid),
                      utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
      # the trusted flag is a scalar, so it gets a different format string
      for kind, a, b in [("trusted", f_trusted, b_trusted)]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " %s vs. %s", kind, os_name,
                      self.cfg.GetNodeName(base.uuid), a, b)

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                  "OSes present on reference node %s"
                  " but missing on this node: %s",
                  self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1371
1372 - def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1373 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}. 1374 1375 @type ninfo: L{objects.Node} 1376 @param ninfo: the node to check 1377 @param nresult: the remote results for the node 1378 @type is_master: bool 1379 @param is_master: Whether node is the master node 1380 1381 """ 1382 cluster = self.cfg.GetClusterInfo() 1383 if (is_master and 1384 (cluster.IsFileStorageEnabled() or 1385 cluster.IsSharedFileStorageEnabled())): 1386 try: 1387 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS] 1388 except KeyError: 1389 # This should never happen 1390 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1391 "Node did not return forbidden file storage paths") 1392 else: 1393 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1394 "Found forbidden file storage paths: %s", 1395 utils.CommaJoin(fspaths)) 1396 else: 1397 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult, 1398 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1399 "Node should not have returned forbidden file storage" 1400 " paths")
1401
1402 - def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template, 1403 verify_key, error_key):
1404 """Verifies (file) storage paths. 1405 1406 @type ninfo: L{objects.Node} 1407 @param ninfo: the node to check 1408 @param nresult: the remote results for the node 1409 @type file_disk_template: string 1410 @param file_disk_template: file-based disk template, whose directory 1411 is supposed to be verified 1412 @type verify_key: string 1413 @param verify_key: key for the verification map of this file 1414 verification step 1415 @param error_key: error key to be added to the verification results 1416 in case something goes wrong in this verification step 1417 1418 """ 1419 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes( 1420 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER 1421 )) 1422 1423 cluster = self.cfg.GetClusterInfo() 1424 if cluster.IsDiskTemplateEnabled(file_disk_template): 1425 self._ErrorIf( 1426 verify_key in nresult, 1427 error_key, ninfo.name, 1428 "The configured %s storage path is unusable: %s" % 1429 (file_disk_template, nresult.get(verify_key)))
1430
1431 - def _VerifyFileStoragePaths(self, ninfo, nresult):
1432 """Verifies (file) storage paths. 1433 1434 @see: C{_VerifyStoragePaths} 1435 1436 """ 1437 self._VerifyStoragePaths( 1438 ninfo, nresult, constants.DT_FILE, 1439 constants.NV_FILE_STORAGE_PATH, 1440 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1441
1442 - def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1443 """Verifies (file) storage paths. 1444 1445 @see: C{_VerifyStoragePaths} 1446 1447 """ 1448 self._VerifyStoragePaths( 1449 ninfo, nresult, constants.DT_SHARED_FILE, 1450 constants.NV_SHARED_FILE_STORAGE_PATH, 1451 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1452
1453 - def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1454 """Verifies (file) storage paths. 1455 1456 @see: C{_VerifyStoragePaths} 1457 1458 """ 1459 self._VerifyStoragePaths( 1460 ninfo, nresult, constants.DT_GLUSTER, 1461 constants.NV_GLUSTER_STORAGE_PATH, 1462 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1463
1464 - def _VerifyOob(self, ninfo, nresult):
1465 """Verifies out of band functionality of a node. 1466 1467 @type ninfo: L{objects.Node} 1468 @param ninfo: the node to check 1469 @param nresult: the remote results for the node 1470 1471 """ 1472 # We just have to verify the paths on master and/or master candidates 1473 # as the oob helper is invoked on the master 1474 if ((ninfo.master_candidate or ninfo.master_capable) and 1475 constants.NV_OOB_PATHS in nresult): 1476 for path_result in nresult[constants.NV_OOB_PATHS]: 1477 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, 1478 ninfo.name, path_result)
1479
1480 - def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1481 """Verifies and updates the node volume data. 1482 1483 This function will update a L{NodeImage}'s internal structures 1484 with data from the remote call. 1485 1486 @type ninfo: L{objects.Node} 1487 @param ninfo: the node to check 1488 @param nresult: the remote results for the node 1489 @param nimg: the node image object 1490 @param vg_name: the configured VG name 1491 1492 """ 1493 nimg.lvm_fail = True 1494 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data") 1495 if vg_name is None: 1496 pass 1497 elif isinstance(lvdata, basestring): 1498 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1499 "LVM problem on node: %s", utils.SafeEncode(lvdata)) 1500 elif not isinstance(lvdata, dict): 1501 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1502 "rpc call to node failed (lvlist)") 1503 else: 1504 nimg.volumes = lvdata 1505 nimg.lvm_fail = False
1506
1507 - def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1508 """Verifies and updates the node instance list. 1509 1510 If the listing was successful, then updates this node's instance 1511 list. Otherwise, it marks the RPC call as failed for the instance 1512 list key. 1513 1514 @type ninfo: L{objects.Node} 1515 @param ninfo: the node to check 1516 @param nresult: the remote results for the node 1517 @param nimg: the node image object 1518 1519 """ 1520 idata = nresult.get(constants.NV_INSTANCELIST, None) 1521 test = not isinstance(idata, list) 1522 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1523 "rpc call to node failed (instancelist): %s", 1524 utils.SafeEncode(str(idata))) 1525 if test: 1526 nimg.hyp_fail = True 1527 else: 1528 nimg.instances = [uuid for (uuid, _) in 1529 self.cfg.GetMultiInstanceInfoByName(idata)]
1530
1531 - def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1532 """Verifies and computes a node information map 1533 1534 @type ninfo: L{objects.Node} 1535 @param ninfo: the node to check 1536 @param nresult: the remote results for the node 1537 @param nimg: the node image object 1538 @param vg_name: the configured VG name 1539 1540 """ 1541 # try to read free memory (from the hypervisor) 1542 hv_info = nresult.get(constants.NV_HVINFO, None) 1543 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info 1544 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1545 "rpc call to node failed (hvinfo)") 1546 if not test: 1547 try: 1548 nimg.mfree = int(hv_info["memory_free"]) 1549 except (ValueError, TypeError): 1550 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1551 "node returned invalid nodeinfo, check hypervisor") 1552 1553 # FIXME: devise a free space model for file based instances as well 1554 if vg_name is not None: 1555 test = (constants.NV_VGLIST not in nresult or 1556 vg_name not in nresult[constants.NV_VGLIST]) 1557 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 1558 "node didn't return data for the volume group '%s'" 1559 " - it is either missing or broken", vg_name) 1560 if not test: 1561 try: 1562 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name]) 1563 except (ValueError, TypeError): 1564 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1565 "node returned invalid LVM info, check LVM status")
1566
  def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
    """Gets per-disk status information for all instances.

    @type node_uuids: list of strings
    @param node_uuids: Node UUIDs
    @type node_image: dict of (UUID, L{objects.Node})
    @param node_image: Node objects
    @type instanceinfo: dict of (UUID, L{objects.Instance})
    @param instanceinfo: Instance objects
    @rtype: {instance: {node: [(succes, payload)]}}
    @return: a dictionary of per-instance dictionaries with nodes as
        keys and disk information as values; the disk information is a
        list of tuples (success, payload)

    """
    node_disks = {}
    node_disks_dev_inst_only = {}
    diskless_instances = set()
    nodisk_instances = set()

    # build the (instance, disk) work list for every node
    for nuuid in node_uuids:
      node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                             node_image[nuuid].sinst))
      diskless_instances.update(uuid for uuid in node_inst_uuids
                                if not instanceinfo[uuid].disks)
      disks = [(inst_uuid, disk)
               for inst_uuid in node_inst_uuids
               for disk in self.cfg.GetInstanceDisks(inst_uuid)]

      if not disks:
        nodisk_instances.update(uuid for uuid in node_inst_uuids
                                if instanceinfo[uuid].disks)
        # No need to collect data
        continue

      node_disks[nuuid] = disks

      # _AnnotateDiskParams makes already copies of the disks
      dev_inst_only = []
      for (inst_uuid, dev) in disks:
        (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                          self.cfg)
        dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

      node_disks_dev_inst_only[nuuid] = dev_inst_only

    assert len(node_disks) == len(node_disks_dev_inst_only)

    # Collect data from all nodes with disks
    result = self.rpc.call_blockdev_getmirrorstatus_multi(
               node_disks.keys(), node_disks_dev_inst_only)

    assert len(result) == len(node_disks)

    instdisk = {}

    for (nuuid, nres) in result.items():
      node = self.cfg.GetNodeInfo(nuuid)
      disks = node_disks[node.uuid]

      if nres.offline:
        # No data from this node
        data = len(disks) * [(False, "node offline")]
      else:
        msg = nres.fail_msg
        self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                      "while getting disk information: %s", msg)
        if msg:
          # No data from this node
          data = len(disks) * [(False, msg)]
        else:
          # keep only well-formed (success, payload) pairs from the node
          data = []
          for idx, i in enumerate(nres.payload):
            if isinstance(i, (tuple, list)) and len(i) == 2:
              data.append(i)
            else:
              logging.warning("Invalid result from node %s, entry %d: %s",
                              node.name, idx, i)
              data.append((False, "Invalid result from the remote node"))

      # data is positionally aligned with the request list in 'disks'
      for ((inst_uuid, _), status) in zip(disks, data):
        instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
          .append(status)

    # Add empty entries for diskless instances.
    for inst_uuid in diskless_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}
    # ...and disk-full instances that happen to have no disks
    for inst_uuid in nodisk_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}

    # sanity-check the shape of the result structure
    assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                      len(nuuids) <= len(
                        self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                      compat.all(isinstance(s, (tuple, list)) and
                                 len(s) == 2 for s in statuses)
                      for inst, nuuids in instdisk.items()
                      for nuuid, statuses in nuuids.items())
    if __debug__:
      instdisk_keys = set(instdisk)
      instanceinfo_keys = set(instanceinfo)
      assert instdisk_keys == instanceinfo_keys, \
        ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
         (instdisk_keys, instanceinfo_keys))

    return instdisk
1675 1676 @staticmethod
1677 - def _SshNodeSelector(group_uuid, all_nodes):
1678 """Create endless iterators for all potential SSH check hosts. 1679 1680 """ 1681 nodes = [node for node in all_nodes 1682 if (node.group != group_uuid and 1683 not node.offline)] 1684 keyfunc = operator.attrgetter("group") 1685 1686 return map(itertools.cycle, 1687 [sorted(map(operator.attrgetter("name"), names)) 1688 for _, names in itertools.groupby(sorted(nodes, key=keyfunc), 1689 keyfunc)])
  @classmethod
  def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
    """Choose which nodes should talk to which other nodes.

    We will make nodes contact all nodes in their group, and one node from
    every other group.

    @rtype: tuple of (list of strings, dict of string to list of strings,
        list of strings)
    @return: a tuple containing the list of all online nodes, a dictionary
      mapping node names to additional nodes of other node groups to which
      connectivity should be tested, and a list of all online master
      candidates

    @warning: This algorithm has a known issue if one node group is much
      smaller than others (e.g. just one node). In such a case all other
      nodes will talk to the single node.

    """
    online_nodes = sorted(node.name for node in group_nodes if not node.offline)
    online_mcs = sorted(node.name for node in group_nodes
                        if (node.master_candidate and not node.offline))
    sel = cls._SshNodeSelector(group_uuid, all_nodes)

    # Drawing one name per cycle and per node spreads the cross-group
    # checks round-robin over each foreign group's members.
    # NOTE(review): 'i.next()' is Python-2-only iterator syntax, and the
    # dict comprehension reuses 'sel' for every name -- presumably 'sel'
    # is a (Python 2) list here, not a one-shot iterator; confirm before
    # porting to Python 3.
    return (online_nodes,
            dict((name, sorted([i.next() for i in sel]))
                 for name in online_nodes),
            online_mcs)
1718
1719 - def _PrepareSshSetupCheck(self):
1720 """Prepare the input data for the SSH setup verification. 1721 1722 """ 1723 all_nodes_info = self.cfg.GetAllNodesInfo() 1724 potential_master_candidates = self.cfg.GetPotentialMasterCandidates() 1725 node_status = [ 1726 (uuid, node_info.name, node_info.master_candidate, 1727 node_info.name in potential_master_candidates, not node_info.offline) 1728 for (uuid, node_info) in all_nodes_info.items()] 1729 return node_status
1730
1731 - def BuildHooksEnv(self):
1732 """Build hooks env. 1733 1734 Cluster-Verify hooks just ran in the post phase and their failure makes 1735 the output be logged in the verify output and the verification to fail. 1736 1737 """ 1738 env = { 1739 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()), 1740 } 1741 1742 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags())) 1743 for node in self.my_node_info.values()) 1744 1745 return env
1746
1747 - def BuildHooksNodes(self):
1748 """Build hooks nodes. 1749 1750 """ 1751 return ([], list(self.my_node_info.keys()))
1752 1753 @staticmethod
1754 - def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced, 1755 i_offline, n_offline, n_drained):
1756 feedback_fn("* Other Notes") 1757 if i_non_redundant: 1758 feedback_fn(" - NOTICE: %d non-redundant instance(s) found." 1759 % len(i_non_redundant)) 1760 1761 if i_non_a_balanced: 1762 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found." 1763 % len(i_non_a_balanced)) 1764 1765 if i_offline: 1766 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline) 1767 1768 if n_offline: 1769 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline) 1770 1771 if n_drained: 1772 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1773
1774 - def _VerifyExclusionTags(self, nodename, pinst, ctags):
1775 """Verify that all instances have different exclusion tags. 1776 1777 @type nodename: string 1778 @param nodename: the name of the node for which the check is done 1779 @type pinst: list of string 1780 @param pinst: list of UUIDs of those instances having the given node 1781 as primary node 1782 @type ctags: list of string 1783 @param ctags: tags of the cluster 1784 1785 """ 1786 exclusion_prefixes = utils.GetExclusionPrefixes(ctags) 1787 tags_seen = set([]) 1788 conflicting_tags = set([]) 1789 for iuuid in pinst: 1790 allitags = self.my_inst_info[iuuid].tags 1791 if allitags is None: 1792 allitags = [] 1793 itags = set([tag for tag in allitags 1794 if utils.IsGoodTag(exclusion_prefixes, tag)]) 1795 conflicts = itags.intersection(tags_seen) 1796 if len(conflicts) > 0: 1797 conflicting_tags = conflicting_tags.union(conflicts) 1798 tags_seen = tags_seen.union(itags) 1799 1800 self._ErrorIf(len(conflicting_tags) > 0, constants.CV_EEXTAGS, nodename, 1801 "Tags where there is more than one instance: %s", 1802 list(conflicting_tags), code=constants.CV_WARNING)
1803
  def Exec(self, feedback_fn): # pylint: disable=R0915
    """Verify integrity of the node group, performing various test on nodes.

    Gathers verification data from every node of the group through a
    node_verify RPC — performed while holding the configuration shared
    (with forcelock), so the config cannot change while its consistency
    is being checked — and then evaluates the per-node, per-instance and
    group-wide results.

    @param feedback_fn: callable taking a single string, used to report
        progress and notices to the caller
    @rtype: boolean
    @return: True if no problem was recorded (C{self.bad} stayed False)

    """
    # This method has too many local variables. pylint: disable=R0914
    feedback_fn("* Verifying group '%s'" % self.group_info.name)

    if not self.my_node_uuids:
      # empty node group
      feedback_fn("* Empty node group, skipping verification")
      return True

    self.bad = False
    verbose = self.op.verbose
    self._feedback_fn = feedback_fn

    vg_name = self.cfg.GetVGName()
    drbd_helper = self.cfg.GetDRBDHelper()
    cluster = self.cfg.GetClusterInfo()
    hypervisors = cluster.enabled_hypervisors
    node_data_list = self.my_node_info.values()

    i_non_redundant = [] # Non redundant instances
    i_non_a_balanced = [] # Non auto-balanced instances
    i_offline = 0 # Count of offline instances
    n_offline = 0 # Count of offline nodes
    n_drained = 0 # Count of nodes being drained
    node_vol_should = {}

    # FIXME: verify OS list

    # File verification
    filemap = ComputeAncillaryFiles(cluster, False)

    # do local checksums
    master_node_uuid = self.master_node = self.cfg.GetMasterNode()
    master_ip = self.cfg.GetMasterIP()

    feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))

    user_scripts = []
    if self.cfg.GetUseExternalMipScript():
      user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # The set of checks each node should run on itself, keyed by NV_*
    # constant; the values parameterize the individual checks.
    # NOTE(review): map() is a one-shot iterator on Python 3; the
    # NV_FILELIST value is re-used below ({key: node_verify_param[key]})
    # when absent nodes exist, so if the first call_node_verify consumes
    # it, the second call may see it exhausted — TODO confirm.
    node_verify_param = {
      constants.NV_FILELIST:
        map(vcluster.MakeVirtualPath,
            utils.UniqueSequence(filename
                                 for files in filemap
                                 for filename in files)),
      constants.NV_NODELIST:
        self._SelectSshCheckNodes(node_data_list, self.group_uuid,
                                  self.all_node_info.values()),
      constants.NV_HYPERVISOR: hypervisors,
      constants.NV_HVPARAMS:
        _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
      constants.NV_NODENETTEST: [(node.name, node.primary_ip,
                                  node.secondary_ip)
                                 for node in node_data_list
                                 if not node.offline],
      constants.NV_INSTANCELIST: hypervisors,
      constants.NV_VERSION: None,
      constants.NV_HVINFO: self.cfg.GetHypervisorType(),
      constants.NV_NODESETUP: None,
      constants.NV_TIME: None,
      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
      constants.NV_OSLIST: None,
      constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
      constants.NV_USERSCRIPTS: user_scripts,
      constants.NV_CLIENT_CERT: None,
      }

    # SSH-setup checks only make sense when Ganeti manages the SSH setup
    if self.cfg.GetClusterInfo().modify_ssh_setup:
      node_verify_param[constants.NV_SSH_SETUP] = \
        (self._PrepareSshSetupCheck(), self.cfg.GetClusterInfo().ssh_key_type)
      if self.op.verify_clutter:
        node_verify_param[constants.NV_SSH_CLUTTER] = True

    if vg_name is not None:
      node_verify_param[constants.NV_VGLIST] = None
      node_verify_param[constants.NV_LVLIST] = vg_name
      node_verify_param[constants.NV_PVLIST] = [vg_name]

    if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
      if drbd_helper:
        node_verify_param[constants.NV_DRBDVERSION] = None
        node_verify_param[constants.NV_DRBDLIST] = None
        node_verify_param[constants.NV_DRBDHELPER] = drbd_helper

    if cluster.IsFileStorageEnabled() or \
        cluster.IsSharedFileStorageEnabled():
      # Load file storage paths only from master node
      node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
        self.cfg.GetMasterNodeName()
      if cluster.IsFileStorageEnabled():
        node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
          cluster.file_storage_dir
      if cluster.IsSharedFileStorageEnabled():
        node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
          cluster.shared_file_storage_dir

    # bridge checks
    # FIXME: this needs to be changed per node-group, not cluster-wide
    bridges = set()
    default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
    if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.add(default_nicpp[constants.NIC_LINK])
    # NOTE: despite its name, inst_uuid is bound to instance objects
    # here (we iterate over .values() and read .nics)
    for inst_uuid in self.my_inst_info.values():
      for nic in inst_uuid.nics:
        full_nic = cluster.SimpleFillNIC(nic.nicparams)
        if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          bridges.add(full_nic[constants.NIC_LINK])

    if bridges:
      node_verify_param[constants.NV_BRIDGES] = list(bridges)

    # Build our expected cluster state
    node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
                                                 uuid=node.uuid,
                                                 vm_capable=node.vm_capable))
                      for node in node_data_list)

    # Gather OOB paths
    oob_paths = []
    for node in self.all_node_info.values():
      path = SupportsOob(self.cfg, node)
      if path and path not in oob_paths:
        oob_paths.append(path)

    if oob_paths:
      node_verify_param[constants.NV_OOB_PATHS] = oob_paths

    # Fill the expected per-node state: primary/secondary instance
    # lists, secondary-by-primary map and the volumes each node should
    # have (node_vol_should); ghost images are created for nodes that
    # host instances but are outside this group or unknown.
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if instance.admin_state == constants.ADMINST_OFFLINE:
        i_offline += 1

      inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
      for nuuid in inst_nodes:
        if nuuid not in node_image:
          gnode = self.NodeImage(uuid=nuuid)
          gnode.ghost = (nuuid not in self.all_node_info)
          node_image[nuuid] = gnode

      self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

      pnode = instance.primary_node
      node_image[pnode].pinst.append(instance.uuid)

      for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        nimg = node_image[snode]
        nimg.sinst.append(instance.uuid)
        if pnode not in nimg.sbp:
          nimg.sbp[pnode] = []
        nimg.sbp[pnode].append(instance.uuid)

    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
                                               self.my_node_info.keys())
    # The value of exclusive_storage should be the same across the group, so if
    # it's True for at least a node, we act as if it were set for all the nodes
    self._exclusive_storage = compat.any(es_flags.values())
    if self._exclusive_storage:
      node_verify_param[constants.NV_EXCLUSIVEPVS] = True

    # At this point, we have the in-memory data structures complete,
    # except for the runtime information, which we'll gather next

    # NOTE: Here we lock the configuration for the duration of RPC calls,
    # which means that the cluster configuration changes are blocked during
    # this period.
    # This is something that should be done only exceptionally and only for
    # justified cases!
    # In this case, we need the lock as we can only verify the integrity of
    # configuration files on MCs only if we know nobody else is modifying it.
    # FIXME: The check for integrity of config.data should be moved to
    # WConfD, which is the only one who can otherwise ensure nobody
    # will modify the configuration during the check.
    with self.cfg.GetConfigManager(shared=True, forcelock=True):
      feedback_fn("* Gathering information about nodes (%s nodes)" %
                  len(self.my_node_uuids))
      # Force the configuration to be fully distributed before doing any tests
      self.cfg.FlushConfigGroup(self.group_uuid)
      # Due to the way our RPC system works, exact response times cannot be
      # guaranteed (e.g. a broken node could run into a timeout). By keeping
      # the time before and after executing the request, we can at least have
      # a time window.
      nvinfo_starttime = time.time()
      # Get lock on the configuration so that nobody modifies it concurrently.
      # Otherwise it can be modified by other jobs, failing the consistency
      # test.
      # NOTE: This is an exceptional situation, we should otherwise avoid
      # locking the configuration for something but very fast, pure operations.
      cluster_name = self.cfg.GetClusterName()
      hvparams = self.cfg.GetClusterInfo().hvparams
      all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                             node_verify_param,
                                             cluster_name,
                                             hvparams)
      nvinfo_endtime = time.time()

      if self.extra_lv_nodes and vg_name is not None:
        feedback_fn("* Gathering information about extra nodes (%s nodes)" %
                    len(self.extra_lv_nodes))
        extra_lv_nvinfo = \
            self.rpc.call_node_verify(self.extra_lv_nodes,
                                      {constants.NV_LVLIST: vg_name},
                                      self.cfg.GetClusterName(),
                                      self.cfg.GetClusterInfo().hvparams)
      else:
        extra_lv_nvinfo = {}

      # If not all nodes are being checked, we need to make sure the master
      # node and a non-checked vm_capable node are in the list.
      absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
      if absent_node_uuids:
        vf_nvinfo = all_nvinfo.copy()
        vf_node_info = list(self.my_node_info.values())
        additional_node_uuids = []
        if master_node_uuid not in self.my_node_info:
          additional_node_uuids.append(master_node_uuid)
          vf_node_info.append(self.all_node_info[master_node_uuid])
        # Add the first vm_capable node we find which is not included,
        # excluding the master node (which we already have)
        for node_uuid in absent_node_uuids:
          nodeinfo = self.all_node_info[node_uuid]
          if (nodeinfo.vm_capable and not nodeinfo.offline and
              node_uuid != master_node_uuid):
            additional_node_uuids.append(node_uuid)
            vf_node_info.append(self.all_node_info[node_uuid])
            break
        key = constants.NV_FILELIST

        feedback_fn("* Gathering information about the master node")
        vf_nvinfo.update(self.rpc.call_node_verify(
            additional_node_uuids, {key: node_verify_param[key]},
            self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams))
      else:
        vf_nvinfo = all_nvinfo
        vf_node_info = self.my_node_info.values()

      all_drbd_map = self.cfg.ComputeDRBDMap()

      feedback_fn("* Gathering disk information (%s nodes)" %
                  len(self.my_node_uuids))
      instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
                                       self.my_inst_info)

      feedback_fn("* Verifying configuration file consistency")

      # These checks need the shared config lock held above: they compare
      # config file state across master candidates.
      self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
      if self.cfg.GetClusterInfo().modify_ssh_setup:
        self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
      self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)

    feedback_fn("* Verifying node status")

    # reference OS image for cross-node OS consistency checks; the first
    # node whose OS query succeeded becomes the reference
    refos_img = None

    for node_i in node_data_list:
      nimg = node_image[node_i.uuid]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node_i.name,))
        n_offline += 1
        continue

      if node_i.uuid == master_node_uuid:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        n_drained += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))

      msg = all_nvinfo[node_i.uuid].fail_msg
      self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
                    "while contacting node: %s", msg)
      if msg:
        nimg.rpc_fail = True
        continue

      nresult = all_nvinfo[node_i.uuid].payload

      nimg.call_ok = self._VerifyNode(node_i, nresult)
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
      self._VerifyNodeNetwork(node_i, nresult)
      self._VerifyNodeUserScripts(node_i, nresult)
      self._VerifyOob(node_i, nresult)
      self._VerifyAcceptedFileStoragePaths(node_i, nresult,
                                           node_i.uuid == master_node_uuid)
      self._VerifyFileStoragePaths(node_i, nresult)
      self._VerifySharedFileStoragePaths(node_i, nresult)
      self._VerifyGlusterStoragePaths(node_i, nresult)

      if nimg.vm_capable:
        self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
        if constants.DT_DRBD8 in cluster.enabled_disk_templates:
          self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
                               self.all_disks_info, drbd_helper, all_drbd_map)

        if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
            (constants.DT_DRBD8 in cluster.enabled_disk_templates):
          self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
        self._UpdateNodeInstances(node_i, nresult, nimg)
        self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
        self._UpdateNodeOS(node_i, nresult, nimg)

        if not nimg.os_fail:
          if refos_img is None:
            refos_img = nimg
          self._VerifyNodeOS(node_i, nimg, refos_img)
        self._VerifyNodeBridges(node_i, nresult, bridges)

        # Check whether all running instances are primary for the node. (This
        # can no longer be done from _VerifyInstance below, since some of the
        # wrong instances could be from other node groups.)
        non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)

        for inst_uuid in non_primary_inst_uuids:
          test = inst_uuid in self.all_inst_info
          self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
                        self.cfg.GetInstanceName(inst_uuid),
                        "instance should not run on node %s", node_i.name)
          self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE,
                        node_i.name,
                        "node is running unknown instance %s", inst_uuid)

      self._VerifyExclusionTags(node_i.name, nimg.pinst, cluster.tags)

    self._VerifyGroupDRBDVersion(all_nvinfo)
    self._VerifyGroupLVM(node_image, vg_name)

    for node_uuid, result in extra_lv_nvinfo.items():
      self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
                              node_image[node_uuid], vg_name)

    feedback_fn("* Verifying instance status")
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if verbose:
        feedback_fn("* Verifying instance %s" % instance.name)
      self._VerifyInstance(instance, node_image, instdisk[inst_uuid])

      # If the instance is not fully redundant we cannot survive losing its
      # primary node, so we are not N+1 compliant.
      inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
      if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
        i_non_redundant.append(instance)

      if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
        i_non_a_balanced.append(instance)

    feedback_fn("* Verifying orphan volumes")
    reserved = utils.FieldSet(*cluster.reserved_lvs)

    # We will get spurious "unknown volume" warnings if any node of this group
    # is secondary for an instance whose primary is in another group. To avoid
    # them, we find these instances and add their volumes to node_vol_should.
    for instance in self.all_inst_info.values():
      for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        if (secondary in self.my_node_info
            and instance.uuid not in self.my_inst_info):
          self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
          break

    self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)

    if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
      feedback_fn("* Verifying N+1 Memory redundancy")
      self._VerifyNPlusOneMemory(node_image, self.my_inst_info)

    self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                           i_offline, n_offline, n_drained)

    return not self.bad
2183 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2184 """Analyze the post-hooks' result 2185 2186 This method analyses the hook result, handles it, and sends some 2187 nicely-formatted feedback back to the user. 2188 2189 @param phase: one of L{constants.HOOKS_PHASE_POST} or 2190 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase 2191 @param hooks_results: the results of the multi-node hooks rpc call 2192 @param feedback_fn: function used send feedback back to the caller 2193 @param lu_result: previous Exec result 2194 @return: the new Exec result, based on the previous result 2195 and hook results 2196 2197 """ 2198 # We only really run POST phase hooks, only for non-empty groups, 2199 # and are only interested in their results 2200 if not self.my_node_uuids: 2201 # empty node group 2202 pass 2203 elif phase == constants.HOOKS_PHASE_POST: 2204 # Used to change hooks' output to proper indentation 2205 feedback_fn("* Hooks Results") 2206 assert hooks_results, "invalid result from hooks" 2207 2208 for node_name in hooks_results: 2209 res = hooks_results[node_name] 2210 msg = res.fail_msg 2211 test = msg and not res.offline 2212 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2213 "Communication failure in hooks execution: %s", msg) 2214 if test: 2215 lu_result = False 2216 continue 2217 if res.offline: 2218 # No need to investigate payload if node is offline 2219 continue 2220 for script, hkr, output in res.payload: 2221 test = hkr == constants.HKR_FAIL 2222 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2223 "Script %s failed, output:", script) 2224 if test: 2225 output = self._HOOKS_INDENT_RE.sub(" ", output) 2226 feedback_fn("%s" % output) 2227 lu_result = False 2228 2229 return lu_result
2230