Package ganeti :: Package cmdlib :: Package cluster :: Module verify
[hide private]
[frames] | [no frames]

Source Code for Module ganeti.cmdlib.cluster.verify

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30  """Logical units for cluster verification.""" 
  31   
  32  import itertools 
  33  import logging 
  34  import operator 
  35  import re 
  36  import time 
  37  import ganeti.masterd.instance 
  38  import ganeti.rpc.node as rpc 
  39   
  40  from ganeti import compat 
  41  from ganeti import constants 
  42  from ganeti import errors 
  43  from ganeti import locking 
  44  from ganeti import pathutils 
  45  from ganeti import utils 
  46  from ganeti import vcluster 
  47  from ganeti import hypervisor 
  48  from ganeti import opcodes 
  49   
  50  from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs 
  51  from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \ 
  52      CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \ 
  53      SupportsOob 
54 55 56 -def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters. 58 59 @type cluster: L{objects.Cluster} 60 @param cluster: the cluster object 61 @param instances: list of L{objects.Instance} 62 @param instances: additional instances from which to obtain parameters 63 @rtype: list of (origin, hypervisor, parameters) 64 @return: a list with all parameters found, indicating the hypervisor they 65 apply to, and the origin (can be "cluster", "os X", or "instance Y") 66 67 """ 68 hvp_data = [] 69 70 for hv_name in cluster.enabled_hypervisors: 71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) 72 73 for os_name, os_hvp in cluster.os_hvp.items(): 74 for hv_name, hv_params in os_hvp.items(): 75 if hv_params: 76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) 77 hvp_data.append(("os %s" % os_name, hv_name, full_params)) 78 79 # TODO: collapse identical parameter values in a single one 80 for instance in instances: 81 if instance.hvparams: 82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor, 83 cluster.FillHV(instance))) 84 85 return hvp_data
86
class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    (itype, etxt, _) = ecode
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)

    # Errors listed in the opcode's ignore_errors are demoted to warnings
    if etxt in self.op.ignore_errors:  # pylint: disable=E1101
      ltype = self.ETYPE_WARNING

    # Interpolate positional arguments into the message first
    if args:
      msg = msg % args

    # Then build the full report line, machine-parseable if requested
    if self.op.error_codes:  # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      prefix = ""
      if item:
        prefix = " " + item
      msg = "%s: %s%s: %s" % (ltype, itype, prefix, msg)

    # Report via the LU's feedback function
    self._feedback_fn("  - %s" % msg)  # Mix-in. pylint: disable=E1101

    # Only real errors (not warnings) mark the operation as failed
    if ltype == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    # debug_simulate_errors forces every check to report, for testing
    if bool(cond) or self.op.debug_simulate_errors:  # pylint: disable=E1101
      self._Error(*args, **kwargs)
140
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # No locks needed: this LU only submits other jobs
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification job set.

    One L{opcodes.OpClusterVerifyGroup} job is submitted per node group;
    for a whole-cluster run, an L{opcodes.OpClusterVerifyConfig} job is
    submitted first and the group jobs depend on it.

    NOTE(review): the epydoc dump loses indentation; this reconstruction
    nests the config-verify job inside the else branch, matching the
    "Always depend on global verification" comment — confirm upstream.

    """
    jobs = []

    if self.op.group_name:
      # Verifying a single group: no global config check, no dependencies
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
        ])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs):  # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        # Only OpClusterVerifyGroup is expected to support skip_checks;
        # raising here for a group opcode would indicate a bug
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
186
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.share_locks = ShareAll()
    # Lock either the single requested group or every group in the cluster
    if self.op.group_name:
      groups = [self.cfg.LookupNodeGroup(self.op.group_name)]
    else:
      groups = locking.ALL_SET
    self.needed_locks = {
      locking.LEVEL_NODEGROUP: groups,
      }

  def Exec(self, feedback_fn):
    # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
    jobs = []
    for group in self.owned_locks(locking.LEVEL_NODEGROUP):
      jobs.append([opcodes.OpGroupVerifyDisks(group_name=group)])
    return ResultWithJobs(jobs)
211
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @param hvp_data: list of (origin, hypervisor, parameters) tuples, as
        returned by L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      # FIX: the format arguments were swapped (origin printed as the
      # hypervisor name and vice versa); item is the origin, hv_name the
      # hypervisor
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      # FIX: "except X as err" instead of Python 2-only "except X, err"
      except errors.GenericError as err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Shared locks on everything: this LU only reads the configuration
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no error was reported (warnings do not count)

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg,
                    code=errcode)

    # The luxi daemon must be able to read the noded certificate
    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    # FIX: build the UUID set once instead of rebuilding a list per instance
    # (was O(nodes * instances))
    dangling_node_uuids = frozenset(node.uuid for node in dangling_nodes)

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
313
314 315 -class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
316 """Verifies the status of a node group. 317 318 """ 319 HPATH = "cluster-verify" 320 HTYPE = constants.HTYPE_CLUSTER 321 REQ_BGL = False 322 323 _HOOKS_INDENT_RE = re.compile("^", re.M) 324
class NodeImage(object):
  """Container for the logical and physical status of a single node.

  @type uuid: string
  @ivar uuid: the node UUID to which this object refers
  @ivar volumes: a structure as returned from
      L{ganeti.backend.GetVolumeList} (runtime)
  @ivar instances: a list of running instances (runtime)
  @ivar pinst: list of configured primary instances (config)
  @ivar sinst: list of configured secondary instances (config)
  @ivar sbp: dictionary of {primary-node: list of instances} for all
      instances for which this node is secondary (config)
  @ivar mfree: free memory, as reported by hypervisor (runtime)
  @ivar mtotal: total memory, as reported by hypervisor (runtime)
  @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
  @ivar dfree: free disk, as reported by the node (runtime)
  @ivar offline: the offline status (config)
  @type rpc_fail: boolean
  @ivar rpc_fail: whether the RPC verify call was successful (overall,
      not whether the individual keys were correct) (runtime)
  @type lvm_fail: boolean
  @ivar lvm_fail: whether the RPC call didn't return valid LVM data
  @type hyp_fail: boolean
  @ivar hyp_fail: whether the RPC call didn't return the instance list
  @type ghost: boolean
  @ivar ghost: whether this is a known node or not (config)
  @type os_fail: boolean
  @ivar os_fail: whether the RPC call didn't return valid OS data
  @type oslist: list
  @ivar oslist: list of OSes as diagnosed by DiagnoseOS
  @type vm_capable: boolean
  @ivar vm_capable: whether the node can host instances
  @type pv_min: float
  @ivar pv_min: size in MiB of the smallest PVs
  @type pv_max: float
  @ivar pv_max: size in MiB of the biggest PVs

  """
  def __init__(self, offline=False, uuid=None, vm_capable=True):
    # Identity and configuration state
    self.uuid = uuid
    self.offline = offline
    self.vm_capable = vm_capable
    self.ghost = False
    # Configured instance relations
    self.pinst = []
    self.sinst = []
    self.sbp = {}
    # Runtime data gathered from the node
    self.volumes = {}
    self.instances = []
    self.oslist = {}
    self.mfree = 0
    self.mtotal = 0
    self.mdom0 = 0
    self.dfree = 0
    self.pv_min = None
    self.pv_max = None
    # Per-category failure flags for the verify RPC
    self.rpc_fail = False
    self.lvm_fail = False
    self.hyp_fail = False
    self.os_fail = False
384
def ExpandNames(self):
  """Resolve the target group and declare the locks needed to verify it.

  """
  # LookupNodeGroup raises errors.OpPrereqError on unknown groups by itself
  self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)

  self.share_locks = ShareAll()

  # The group's instance list is unsafe here; it is re-verified later in
  # CheckPrereq once the locks are held
  group_insts = \
    self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

  self.needed_locks = {
    locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(group_insts),
    locking.LEVEL_NODEGROUP: [self.group_uuid],
    locking.LEVEL_NODE: [],
    }
400
def DeclareLocks(self, level):
  """Compute the node-level lock set once the instance locks are held.

  """
  if level != locking.LEVEL_NODE:
    return

  # Group membership is unsafe here and is re-verified in CheckPrereq
  node_uuids = set(self.cfg.GetNodeGroup(self.group_uuid).members)

  # In Exec(), we warn about mirrored instances that have primary and
  # secondary living in separate node groups. To fully verify that
  # volumes for these instances are healthy, we will need to do an
  # extra call to their secondaries. We ensure here those nodes will
  # be locked.
  for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
    # Important: access only the instances whose lock is owned
    inst = self.cfg.GetInstanceInfoByName(inst_name)
    inst_disks = self.cfg.GetInstanceDisks(inst.uuid)
    if utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
      node_uuids.update(self.cfg.GetInstanceSecondaryNodes(inst.uuid))

  self.needed_locks[locking.LEVEL_NODE] = node_uuids
419
def CheckPrereq(self):
  """Check prerequisites.

  Verifies that all nodes and instances of the target group are actually
  locked (they may have been added/moved after ExpandNames ran), caches
  the full cluster configuration, and computes the out-of-group nodes
  that need extra LV RPC calls.

  @raise errors.OpPrereqError: if a needed node or instance lock is missing

  """
  assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
  self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

  group_node_uuids = set(self.group_info.members)
  group_inst_uuids = \
    self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

  # Locks were declared from a possibly stale view of the group; anything
  # that joined the group since then is not locked and must abort the LU
  unlocked_node_uuids = \
    group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

  unlocked_inst_uuids = \
    group_inst_uuids.difference(
      [self.cfg.GetInstanceInfoByName(name).uuid
       for name in self.owned_locks(locking.LEVEL_INSTANCE)])

  if unlocked_node_uuids:
    raise errors.OpPrereqError(
      "Missing lock for nodes: %s" %
      utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
      errors.ECODE_STATE)

  if unlocked_inst_uuids:
    raise errors.OpPrereqError(
      "Missing lock for instances: %s" %
      utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
      errors.ECODE_STATE)

  # Cache whole-cluster views plus the group-restricted subsets used below
  self.all_node_info = self.cfg.GetAllNodesInfo()
  self.all_inst_info = self.cfg.GetAllInstancesInfo()
  self.all_disks_info = self.cfg.GetAllDisksInfo()

  self.my_node_uuids = group_node_uuids
  self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                           for node_uuid in group_node_uuids)

  self.my_inst_uuids = group_inst_uuids
  self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                           for inst_uuid in group_inst_uuids)

  # We detect here the nodes that will need the extra RPC calls for verifying
  # split LV volumes; they should be locked.
  extra_lv_nodes = set()

  for inst in self.my_inst_info.values():
    disks = self.cfg.GetInstanceDisks(inst.uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
      for nuuid in inst_nodes:
        # Nodes outside this group hold mirror halves we must still check
        if self.all_node_info[nuuid].group != self.group_uuid:
          extra_lv_nodes.add(nuuid)

  unlocked_lv_nodes = \
    extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

  if unlocked_lv_nodes:
    raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                               utils.CommaJoin(unlocked_lv_nodes),
                               errors.ECODE_STATE)
  self.extra_lv_nodes = list(extra_lv_nodes)
480
def _VerifyNode(self, ninfo, nresult):
  """Perform some basic validation on data returned from a node.

    - check the result data structure is well formed and has all the
      mandatory fields
    - check ganeti version

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the results from the node
  @rtype: boolean
  @return: whether overall this call was successful (and we can expect
       reasonable values in the response)

  """
  # main result, nresult should be a non-empty dict
  test = not nresult or not isinstance(nresult, dict)
  self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                "unable to verify node: no data returned")
  if test:
    return False

  # compares ganeti version
  local_version = constants.PROTOCOL_VERSION
  remote_version = nresult.get("version", None)
  test = not (remote_version and
              isinstance(remote_version, (list, tuple)) and
              len(remote_version) == 2)
  self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                "connection to node returned invalid data")
  if test:
    return False

  test = local_version != remote_version[0]
  self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
                "incompatible protocol versions: master %s,"
                " node %s", local_version, remote_version[0])
  if test:
    return False

  # node seems compatible, we can actually try to look into its results

  # full package version; only a warning since mixed versions can work
  self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                constants.CV_ENODEVERSION, ninfo.name,
                "software version mismatch: master %s, node %s",
                constants.RELEASE_VERSION, remote_version[1],
                code=self.ETYPE_WARNING)

  hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
  if ninfo.vm_capable and isinstance(hyp_result, dict):
    # FIX: items() instead of Python 2-only iteritems(); behavior identical
    for hv_name, hv_result in hyp_result.items():
      # a non-None entry is the failure message for that hypervisor
      test = hv_result is not None
      self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                    "hypervisor %s verify failure: '%s'", hv_name, hv_result)

  hvp_result = nresult.get(constants.NV_HVPARAMS, None)
  if ninfo.vm_capable and isinstance(hvp_result, list):
    # every entry in this list is a reported parameter failure
    for item, hv_name, hv_result in hvp_result:
      self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
                    "hypervisor %s parameter verify failure (source %s): %s",
                    hv_name, item, hv_result)

  # missing key counts as an error; an empty list means everything is fine
  test = nresult.get(constants.NV_NODESETUP,
                     ["Missing NODESETUP results"])
  self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
                "node setup error: %s", "; ".join(test))

  return True
550
def _VerifyNodeTime(self, ninfo, nresult,
                    nvinfo_starttime, nvinfo_endtime):
  """Check the node time.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nvinfo_starttime: the start time of the RPC call
  @param nvinfo_endtime: the end time of the RPC call

  """
  try:
    remote_time = utils.MergeTime(nresult.get(constants.NV_TIME, None))
  except (ValueError, TypeError):
    self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
                  "Node returned invalid time")
    return

  # The node's clock must fall within the RPC window, allowing for the
  # configured maximum skew on either side
  skew = None
  if remote_time < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
    skew = "%.01fs" % abs(nvinfo_starttime - remote_time)
  elif remote_time > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
    skew = "%.01fs" % abs(remote_time - nvinfo_endtime)

  self._ErrorIf(skew is not None, constants.CV_ENODETIME, ninfo.name,
                "Node time diverges by at least %s from master node time",
                skew)
580
def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
  """Check the node LVM results and update info for cross-node checks.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param vg_name: the configured VG name
  @type nimg: L{NodeImage}
  @param nimg: node image

  """
  if vg_name is None:
    return

  # Volume group must exist and be at least MIN_VG_SIZE
  vglist = nresult.get(constants.NV_VGLIST, None)
  no_vg_data = not vglist
  self._ErrorIf(no_vg_data, constants.CV_ENODELVM, ninfo.name,
                "unable to check volume groups")
  if not no_vg_data:
    vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                          constants.MIN_VG_SIZE)
    self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)

  # Check PVs and remember the PV size extremes for the group-wide check
  (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
  for errmsg in errmsgs:
    self._Error(constants.CV_ENODELVM, ninfo.name, errmsg)
  if pvminmax is not None:
    (nimg.pv_min, nimg.pv_max) = pvminmax
611
def _VerifyGroupDRBDVersion(self, node_verify_infos):
  """Check cross-node DRBD version consistency.

  @type node_verify_infos: dict
  @param node_verify_infos: infos about nodes as returned from the
    node_verify call.

  """
  # Collect the reported DRBD version of every node that returned one
  versions_by_node = {}
  for node_uuid, ndata in node_verify_infos.items():
    payload = ndata.payload
    if not payload:
      continue
    drbd_version = payload.get(constants.NV_DRBDVERSION, None)
    if drbd_version:
      versions_by_node[node_uuid] = drbd_version

  # More than one distinct version in the group: warn on every node
  if len(set(versions_by_node.values())) > 1:
    for node_uuid, drbd_version in sorted(versions_by_node.items()):
      self._Error(constants.CV_ENODEDRBDHELPER, node_uuid,
                  "DRBD version mismatch: %s" % drbd_version,
                  code=self.ETYPE_WARNING)
633
def _VerifyGroupLVM(self, node_image, vg_name):
  """Check cross-node consistency in LVM.

  @type node_image: dict
  @param node_image: info about nodes, mapping from node to names to
    L{NodeImage} objects
  @param vg_name: the configured VG name

  """
  if vg_name is None:
    return

  # Only exclusive storage needs this kind of checks
  if not self._exclusive_storage:
    return

  # exclusive_storage wants all PVs to have the same size (approximately),
  # if the smallest and the biggest ones are okay, everything is fine.
  # pv_min is None iff pv_max is None
  # FIX: use a list comprehension instead of filter(); under Python 3
  # filter() returns a lazy iterator that is always truthy, which would
  # break the emptiness test below (equivalent on Python 2)
  vals = [ni for ni in node_image.values() if ni.pv_min is not None]
  if not vals:
    return
  (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
  (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
  bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
  self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
                "PV sizes differ too much in the group; smallest (%s MB) is"
                " on %s, biggest (%s MB) is on %s",
                pvmin, self.cfg.GetNodeName(minnode_uuid),
                pvmax, self.cfg.GetNodeName(maxnode_uuid))
664
def _VerifyNodeBridges(self, ninfo, nresult, bridges):
  """Check the node bridges.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param bridges: the expected list of bridges

  """
  if not bridges:
    return

  # The node reports back the subset of requested bridges it is missing
  missing = nresult.get(constants.NV_BRIDGES, None)
  bad_data = not isinstance(missing, list)
  self._ErrorIf(bad_data, constants.CV_ENODENET, ninfo.name,
                "did not return valid bridge information")
  if not bad_data:
    self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
                  "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
684
def _VerifyNodeUserScripts(self, ninfo, nresult):
  """Check the results of user scripts presence and executability on the node

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  no_data = constants.NV_USERSCRIPTS not in nresult
  self._ErrorIf(no_data, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
                "did not return user scripts information")

  # The node reports the scripts that are missing or not executable
  broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
  if not no_data:
    self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
                  "user scripts not present or not executable: %s" %
                  utils.CommaJoin(sorted(broken_scripts)))
702
def _VerifyNodeNetwork(self, ninfo, nresult):
  """Check the node network connectivity results.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  # SSH connectivity: the node returns a dict of {peer: error message}
  missing_ssh = constants.NV_NODELIST not in nresult
  self._ErrorIf(missing_ssh, constants.CV_ENODESSH, ninfo.name,
                "node hasn't returned node ssh connectivity data")
  if not missing_ssh and nresult[constants.NV_NODELIST]:
    for remote_node, remote_msg in nresult[constants.NV_NODELIST].items():
      self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
                    "ssh communication with node '%s': %s",
                    remote_node, remote_msg)

  # TCP connectivity: a non-empty dict means failed peers
  missing_tcp = constants.NV_NODENETTEST not in nresult
  self._ErrorIf(missing_tcp, constants.CV_ENODENET, ninfo.name,
                "node hasn't returned node tcp connectivity data")
  if not missing_tcp and nresult[constants.NV_NODENETTEST]:
    failed = nresult[constants.NV_NODENETTEST]
    for anode in utils.NiceSort(failed.keys()):
      self._ErrorIf(True, constants.CV_ENODENET, ninfo.name,
                    "tcp communication with node '%s': %s",
                    anode, failed[anode])

  # Master IP reachability: a falsy value means the IP was not reachable
  missing_mip = constants.NV_MASTERIP not in nresult
  self._ErrorIf(missing_mip, constants.CV_ENODENET, ninfo.name,
                "node hasn't returned node master IP reachability data")
  if not missing_mip and not nresult[constants.NV_MASTERIP]:
    if ninfo.uuid == self.master_node:
      msg = "the master node cannot reach the master IP (not configured?)"
    else:
      msg = "cannot reach the master IP"
    self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
741
def _VerifyInstance(self, instance, node_image, diskstatus):
  """Verify an instance.

  This function checks to see if the required block devices are
  available on the instance's node, and that the nodes are in the correct
  state.

  @type instance: L{objects.Instance}
  @param instance: the instance to verify
  @type node_image: dict
  @param node_image: mapping of node UUID to L{NodeImage}
  @param diskstatus: mapping of node to (success, status) pairs per disk

  """
  pnode_uuid = instance.primary_node
  pnode_img = node_image[pnode_uuid]
  groupinfo = self.cfg.GetAllNodeGroupsInfo()

  node_vol_should = {}
  self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

  # Instance policy violations are reported as warnings only
  cluster = self.cfg.GetClusterInfo()
  ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                          self.group_info)
  err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
  self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
                utils.CommaJoin(err), code=self.ETYPE_WARNING)

  # Every LV the config says should exist must show up in the node's
  # reported volume list
  for node_uuid in node_vol_should:
    n_img = node_image[node_uuid]
    if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
      # ignore missing volumes on offline or broken nodes
      continue
    for volume in node_vol_should[node_uuid]:
      test = volume not in n_img.volumes
      self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
                    "volume %s missing on node %s", volume,
                    self.cfg.GetNodeName(node_uuid))

  if instance.admin_state == constants.ADMINST_UP:
    # admin-up instance must actually be running on its (online) primary
    test = instance.uuid not in pnode_img.instances and not pnode_img.offline
    self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
                  "instance not running on its primary node %s",
                  self.cfg.GetNodeName(pnode_uuid))
    self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance is marked as running and lives on"
                  " offline node %s", self.cfg.GetNodeName(pnode_uuid))

  # Flatten diskstatus into (node, success, status, disk index) tuples
  diskdata = [(nname, success, status, idx)
              for (nname, disks) in diskstatus.items()
              for idx, (success, status) in enumerate(disks)]

  for nname, success, bdev_status, idx in diskdata:
    # the 'ghost node' construction in Exec() ensures that we have a
    # node here
    snode = node_image[nname]
    bad_snode = snode.ghost or snode.offline
    self._ErrorIf(instance.disks_active and
                  not success and not bad_snode,
                  constants.CV_EINSTANCEFAULTYDISK, instance.name,
                  "couldn't retrieve status for disk/%s on %s: %s",
                  idx, self.cfg.GetNodeName(nname), bdev_status)

    if instance.disks_active and success and bdev_status.is_degraded:
      msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))

      # Degradation is only a warning while the local disk state is
      # acceptable (okay or syncing); otherwise it is a real error
      code = self.ETYPE_ERROR
      accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]

      if bdev_status.ldisk_status in accepted_lds:
        code = self.ETYPE_WARNING

      msg += "; local disk state is '%s'" % \
             constants.LDS_NAMES[bdev_status.ldisk_status]

      self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
                  code=code)

  self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
                constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
                "instance %s, connection to primary node failed",
                instance.name)

  secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
  self._ErrorIf(len(secondary_nodes) > 1,
                constants.CV_EINSTANCELAYOUT, instance.name,
                "instance has multiple secondary nodes: %s",
                utils.CommaJoin(secondary_nodes),
                code=self.ETYPE_WARNING)

  inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
  es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
  disks = self.cfg.GetInstanceDisks(instance.uuid)
  if any(es_flags.values()):
    if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
      # Disk template not compatible with exclusive_storage: no instance
      # node should have the flag set
      es_nodes = [n
                  for (n, es) in es_flags.items()
                  if es]
      unsupported = [d.dev_type for d in disks
                     if d.dev_type not in constants.DTS_EXCL_STORAGE]
      self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
                  "instance uses disk types %s, which are not supported on"
                  " nodes that have exclusive storage set: %s",
                  utils.CommaJoin(unsupported),
                  utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
    for (idx, disk) in enumerate(disks):
      self._ErrorIf(disk.spindles is None,
                    constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
                    "number of spindles not configured for disk %s while"
                    " exclusive storage is enabled, try running"
                    " gnt-cluster repair-disk-sizes", idx)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    instance_nodes = utils.NiceSort(inst_nodes)
    instance_groups = {}

    for node_uuid in instance_nodes:
      instance_groups.setdefault(self.all_node_info[node_uuid].group,
                                 []).append(node_uuid)

    pretty_list = [
      "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
                         groupinfo[group].name)
      # Sort so that we always list the primary node first.
      # NOTE(review): tuple-unpacking lambda below is Python 2-only syntax
      for group, nodes in sorted(instance_groups.items(),
                                 key=lambda (_, nodes): pnode_uuid in nodes,
                                 reverse=True)]

    self._ErrorIf(len(instance_groups) > 1,
                  constants.CV_EINSTANCESPLITGROUPS,
                  instance.name, "instance has primary and secondary nodes in"
                  " different groups: %s", utils.CommaJoin(pretty_list),
                  code=self.ETYPE_WARNING)

  inst_nodes_offline = []
  for snode in secondary_nodes:
    s_img = node_image[snode]
    self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
                  self.cfg.GetNodeName(snode),
                  "instance %s, connection to secondary node failed",
                  instance.name)

    if s_img.offline:
      inst_nodes_offline.append(snode)

  # warn that the instance lives on offline nodes
  self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
                instance.name, "instance has offline secondary node(s) %s",
                utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
  # ... or ghost/non-vm_capable nodes
  for node_uuid in inst_nodes:
    self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance lives on ghost node %s",
                  self.cfg.GetNodeName(node_uuid))
    self._ErrorIf(not node_image[node_uuid].vm_capable,
                  constants.CV_EINSTANCEBADNODE, instance.name,
                  "instance lives on non-vm_capable node %s",
                  self.cfg.GetNodeName(node_uuid))
896
def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
                         reserved):
  """Check every healthy node for volumes Ganeti does not know about.

  The .os, .swap and backup volumes are ignored. All other volumes are
  reported as unknown.

  @type vg_name: string
  @param vg_name: the name of the Ganeti-administered volume group
  @type reserved: L{ganeti.utils.FieldSet}
  @param reserved: a FieldSet of reserved volume names

  """
  for (nuuid, nimg) in node_image.items():
    healthy = not (nimg.offline or nimg.rpc_fail or nimg.lvm_fail)
    if not healthy or self.all_node_info[nuuid].group != self.group_uuid:
      # only healthy nodes of the currently verified group are checked
      continue
    for lv_name in nimg.volumes:
      if lv_name.split('/')[0] != vg_name:
        # not part of the Ganeti-administered volume group
        continue
      expected = (nuuid in node_vol_should and
                  lv_name in node_vol_should[nuuid])
      self._ErrorIf(not expected and not reserved.Matches(lv_name),
                    constants.CV_ENODEORPHANLV,
                    self.cfg.GetNodeName(nuuid),
                    "volume %s is unknown", lv_name,
                    code=_VerifyErrors.ETYPE_WARNING)
927
def _VerifyNPlusOneMemory(self, node_image, all_insts):
  """Verify N+1 Memory Resilience.

  Check that if one single node dies we can still start all the
  instances it was primary for.

  @param node_image: map of node UUID to L{NodeImage} objects
  @param all_insts: map of instance UUID to instance objects

  """
  cluster_info = self.cfg.GetClusterInfo()
  ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
                                                          self.group_info)
  memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]

  for node_uuid, n_img in node_image.items():
    # This code checks that every node which is now listed as
    # secondary has enough memory to host all instances it is
    # supposed to should a single other node in the cluster fail.
    # FIXME: not ready for failover to an arbitrary node
    # FIXME: does not support file-backed instances
    # WARNING: we currently take into account down instances as well
    # as up ones, considering that even if they're down someone
    # might want to start them even in the event of a node failure.
    node_cfg = self.all_node_info[node_uuid]
    if n_img.offline or node_cfg.group != self.group_uuid:
      # we're skipping nodes marked offline and nodes in other groups from
      # the N+1 warning, since most likely we don't have good memory
      # information from them; we already list instances living on such
      # nodes, and that's enough warning
      continue
    #TODO(dynmem): also consider ballooning out other instances
    for prinode, inst_uuids in n_img.sbp.items():
      needed_mem = 0
      for inst_uuid in inst_uuids:
        bep = cluster_info.FillBE(all_insts[inst_uuid])
        if bep[constants.BE_AUTO_BALANCE]:
          needed_mem += bep[constants.BE_MINMEM]
      mnode = n_img.mdom0
      # FIX: dict views are not indexable on Python 3, so ".items()[0]"
      # only works on Python 2; next(iter(...)) picks the same (single)
      # filled hypervisor state entry on both versions
      (hv, hv_state) = next(iter(
          self.cfg.GetFilledHvStateParams(node_cfg).items()))
      if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
        # non-Xen hypervisors take the reserved node memory from the
        # hypervisor state parameters instead of dom0 measurements
        mnode = hv_state["mem_node"]
      # minimum allowed free memory (it's negative due to over-commitment)
      mem_threshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
      test = n_img.mfree - needed_mem < mem_threshold
      self._ErrorIf(test, constants.CV_ENODEN1,
                    self.cfg.GetNodeName(node_uuid),
                    "not enough memory to accomodate instance failovers"
                    " should node %s fail (%dMiB needed, %dMiB available)",
                    self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
976
def _VerifyClientCertificates(self, nodes, all_nvinfo):
  """Verifies the consistency of the client certificates.

  This includes several aspects:
    - the individual validation of all nodes' certificates
    - the consistency of the master candidate certificate map
    - the consistency of the master candidate certificate map with the
      certificates that the master candidates are actually using.

  @param nodes: the list of nodes to consider in this verification
  @param all_nvinfo: the map of results of the verify_node call to
    all nodes

  """
  candidate_certs = self.cfg.GetClusterInfo().candidate_certs
  # an empty/missing map means the crypto material was never distributed;
  # nothing below can be checked in that case, so report and bail out
  if candidate_certs is None or len(candidate_certs) == 0:
    self._ErrorIf(
      True, constants.CV_ECLUSTERCLIENTCERT, None,
      "The cluster's list of master candidate certificates is empty."
      " If you just updated the cluster, please run"
      " 'gnt-cluster renew-crypto --new-node-certificates'.")
    return

  # digests must be unique per master candidate
  self._ErrorIf(
    len(candidate_certs) != len(set(candidate_certs.values())),
    constants.CV_ECLUSTERCLIENTCERT, None,
    "There are at least two master candidates configured to use the same"
    " certificate.")

  # collect the client certificate
  for node in nodes:
    if node.offline:
      continue

    nresult = all_nvinfo[node.uuid]
    if nresult.fail_msg or not nresult.payload:
      # RPC failures are reported elsewhere; skip certificate checks here
      continue

    # NOTE(review): if NV_CLIENT_CERT is absent from the payload, the
    # .get() default of None cannot be unpacked into a 2-tuple and this
    # raises TypeError — presumably verify_node always fills the key
    # when the payload is non-empty; confirm against the node backend
    (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)

    self._ErrorIf(
      errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
      "Client certificate of node '%s' failed validation: %s (code '%s')",
      node.uuid, msg, errcode)

    if not errcode:
      # on success the payload message carries the certificate digest
      digest = msg
      if node.master_candidate:
        if node.uuid in candidate_certs:
          self._ErrorIf(
            digest != candidate_certs[node.uuid],
            constants.CV_ECLUSTERCLIENTCERT, None,
            "Client certificate digest of master candidate '%s' does not"
            " match its entry in the cluster's map of master candidate"
            " certificates. Expected: %s Got: %s", node.uuid,
            digest, candidate_certs[node.uuid])
        else:
          self._ErrorIf(
            True, constants.CV_ECLUSTERCLIENTCERT, None,
            "The master candidate '%s' does not have an entry in the"
            " map of candidate certificates.", node.uuid)
          self._ErrorIf(
            digest in candidate_certs.values(),
            constants.CV_ECLUSTERCLIENTCERT, None,
            "Master candidate '%s' is using a certificate of another node.",
            node.uuid)
      else:
        self._ErrorIf(
          node.uuid in candidate_certs,
          constants.CV_ECLUSTERCLIENTCERT, None,
          "Node '%s' is not a master candidate, but still listed in the"
          " map of master candidate certificates.", node.uuid)
        self._ErrorIf(
          (node.uuid not in candidate_certs) and
          (digest in candidate_certs.values()),
          constants.CV_ECLUSTERCLIENTCERT, None,
          "Node '%s' is not a master candidate and is incorrectly using a"
          " certificate of another node which is master candidate.",
          node.uuid)
1056
def _VerifySshSetup(self, nodes, all_nvinfo):
  """Evaluates the verification results of the SSH setup and clutter test.

  @param nodes: List of L{objects.Node} objects
  @param all_nvinfo: RPC results

  """
  for node in nodes:
    if node.offline:
      continue
    nresult = all_nvinfo[node.uuid]
    if nresult.fail_msg or not nresult.payload:
      self._ErrorIf(True, constants.CV_ENODESSH, node.name,
                    "Could not verify the SSH setup of this node.")
      # without any payload there is nothing further we can evaluate
      return
    for check_key in (constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER):
      outcome = nresult.payload.get(check_key, None)
      if isinstance(outcome, list):
        message = " ".join(outcome)
      else:
        message = ""
      self._ErrorIf(outcome, constants.CV_ENODESSH, None, message)
1078
def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                 (files_all, files_opt, files_mc, files_vm)):
  """Verifies file checksums collected from all nodes.

  Note: the fourth parameter is a Python-2-only tuple parameter holding
  the sets of (all, optional, master-candidate, vm-capable) file names.

  @param nodes: List of L{objects.Node} objects
  @param master_node_uuid: UUID of master node
  @param all_nvinfo: RPC results

  """
  # Define functions determining which nodes to consider for a file
  files2nodefn = [
    (files_all, None),
    (files_mc, lambda node: (node.master_candidate or
                             node.uuid == master_node_uuid)),
    (files_vm, lambda node: node.vm_capable),
    ]

  # Build mapping from filename to list of nodes which should have the file
  nodefiles = {}
  for (files, fn) in files2nodefn:
    if fn is None:
      filenodes = nodes
    else:
      filenodes = filter(fn, nodes)
    nodefiles.update((filename,
                      frozenset(map(operator.attrgetter("uuid"), filenodes)))
                     for filename in files)

  assert set(nodefiles) == (files_all | files_mc | files_vm)

  # per-filename map of checksum -> set of node UUIDs reporting it
  fileinfo = dict((filename, {}) for filename in nodefiles)
  ignore_nodes = set()

  for node in nodes:
    if node.offline:
      ignore_nodes.add(node.uuid)
      continue

    nresult = all_nvinfo[node.uuid]

    if nresult.fail_msg or not nresult.payload:
      node_files = None
    else:
      # paths come back virtualized; translate them to local paths so
      # they can be compared against nodefiles
      fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
      node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                        for (key, value) in fingerprints.items())
      del fingerprints

    test = not (node_files and isinstance(node_files, dict))
    self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                  "Node did not return file checksum data")
    if test:
      ignore_nodes.add(node.uuid)
      continue

    # Build per-checksum mapping from filename to nodes having it
    for (filename, checksum) in node_files.items():
      assert filename in nodefiles
      fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

  for (filename, checksums) in fileinfo.items():
    assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

    # Nodes having the file
    with_file = frozenset(node_uuid
                          for node_uuids in fileinfo[filename].values()
                          for node_uuid in node_uuids) - ignore_nodes

    expected_nodes = nodefiles[filename] - ignore_nodes

    # Nodes missing file
    missing_file = expected_nodes - with_file

    if filename in files_opt:
      # All or no nodes
      self._ErrorIf(missing_file and missing_file != expected_nodes,
                    constants.CV_ECLUSTERFILECHECK, None,
                    "File %s is optional, but it must exist on all or no"
                    " nodes (not found on %s)",
                    filename,
                    utils.CommaJoin(
                      utils.NiceSort(
                        map(self.cfg.GetNodeName, missing_file))))
    else:
      self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s is missing from node(s) %s", filename,
                    utils.CommaJoin(
                      utils.NiceSort(
                        map(self.cfg.GetNodeName, missing_file))))

      # Warn if a node has a file it shouldn't
      unexpected = with_file - expected_nodes
      self._ErrorIf(unexpected,
                    constants.CV_ECLUSTERFILECHECK, None,
                    "File %s should not exist on node(s) %s",
                    filename, utils.CommaJoin(
                      utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))

    # See if there are multiple versions of the file
    test = len(checksums) > 1
    if test:
      variants = ["variant %s on %s" %
                  (idx + 1,
                   utils.CommaJoin(utils.NiceSort(
                     map(self.cfg.GetNodeName, node_uuids))))
                  for (idx, (checksum, node_uuids)) in
                    enumerate(sorted(checksums.items()))]
    else:
      variants = []

    self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                  "File %s found with %s different checksums (%s)",
                  filename, len(checksums), "; ".join(variants))
1192
def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
  """Check the DRBD usermode helper reported by one node.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param drbd_helper: the cluster-configured DRBD usermode helper

  """
  if not drbd_helper:
    # DRBD not configured on the cluster, nothing to compare against
    return
  helper_result = nresult.get(constants.NV_DRBDHELPER, None)
  self._ErrorIf(helper_result is None, constants.CV_ENODEDRBDHELPER,
                ninfo.name, "no drbd usermode helper returned")
  if helper_result:
    (status, payload) = helper_result
    self._ErrorIf(not status, constants.CV_ENODEDRBDHELPER, ninfo.name,
                  "drbd usermode helper check unsuccessful: %s", payload)
    self._ErrorIf(status and payload != drbd_helper,
                  constants.CV_ENODEDRBDHELPER, ninfo.name,
                  "wrong drbd usermode helper: %s", payload)
1210 1211 @staticmethod
1212 - def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1213 """Gives the DRBD information in a map for a node. 1214 1215 @type ninfo: L{objects.Node} 1216 @param ninfo: the node to check 1217 @param instanceinfo: the dict of instances 1218 @param disks_info: the dict of disks 1219 @param drbd_map: the DRBD map as returned by 1220 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1221 @type error_if: callable like L{_ErrorIf} 1222 @param error_if: The error reporting function 1223 @return: dict from minor number to (disk_uuid, instance_uuid, active) 1224 1225 """ 1226 node_drbd = {} 1227 for minor, disk_uuid in drbd_map[ninfo.uuid].items(): 1228 test = disk_uuid not in disks_info 1229 error_if(test, constants.CV_ECLUSTERCFG, None, 1230 "ghost disk '%s' in temporary DRBD map", disk_uuid) 1231 # ghost disk should not be active, but otherwise we 1232 # don't give double warnings (both ghost disk and 1233 # unallocated minor in use) 1234 if test: 1235 node_drbd[minor] = (disk_uuid, None, False) 1236 else: 1237 disk_active = False 1238 disk_instance = None 1239 for (inst_uuid, inst) in instanceinfo.items(): 1240 if disk_uuid in inst.disks: 1241 disk_active = inst.disks_active 1242 disk_instance = inst_uuid 1243 break 1244 node_drbd[minor] = (disk_uuid, disk_instance, disk_active) 1245 return node_drbd
1246
def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
                    drbd_helper, drbd_map):
  """Verifies the node DRBD status.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param instanceinfo: the dict of instances
  @param disks_info: the dict of disks
  @param drbd_helper: the configured DRBD usermode helper
  @param drbd_map: the DRBD map as returned by
    L{ganeti.config.ConfigWriter.ComputeDRBDMap}

  """
  self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)

  # compute the DRBD minors
  expected = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
                                     drbd_map, self._ErrorIf)

  # and now check them against what the node reports as in use
  in_use = nresult.get(constants.NV_DRBDLIST, [])
  if not isinstance(in_use, (tuple, list)):
    self._ErrorIf(True, constants.CV_ENODEDRBD, ninfo.name,
                  "cannot parse drbd status file: %s", str(in_use))
    # we cannot check drbd status
    return

  for (minor, (disk_uuid, inst_uuid, must_exist)) in expected.items():
    if inst_uuid is None:
      where = "(detached)"
    else:
      where = ("(attached in instance '%s')" %
               self.cfg.GetInstanceName(inst_uuid))
    self._ErrorIf(minor not in in_use and must_exist,
                  constants.CV_ENODEDRBD, ninfo.name,
                  "drbd minor %d of disk %s %s is not active",
                  minor, disk_uuid, where)
  for minor in in_use:
    self._ErrorIf(minor not in expected, constants.CV_ENODEDRBD, ninfo.name,
                  "unallocated drbd minor %d is in use", minor)
1290
def _UpdateNodeOS(self, ninfo, nresult, nimg):
  """Builds the node OS structures.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object

  """
  remote_os = nresult.get(constants.NV_OSLIST, None)
  valid = (isinstance(remote_os, list) and
           compat.all(isinstance(entry, list) and len(entry) == 8
                      for entry in remote_os))

  self._ErrorIf(not valid, constants.CV_ENODEOS, ninfo.name,
                "node hasn't returned valid OS data")

  nimg.os_fail = not valid

  if not valid:
    return

  os_dict = {}

  for (name, os_path, status, diagnose, variants, parameters, api_ver,
       trusted) in nresult[constants.NV_OSLIST]:
    # parameters is a list of lists instead of list of tuples due to
    # JSON lacking a real tuple type, fix it:
    params = set(tuple(v) for v in parameters)
    os_dict.setdefault(name, []).append((os_path, status, diagnose,
                                         set(variants), params,
                                         set(api_ver), trusted))

  nimg.oslist = os_dict
1330
def _VerifyNodeOS(self, ninfo, nimg, base):
  """Verifies the node OS list.

  Each entry in an oslist value is a tuple of
  (path, status, diagnose, variants, parameters, api_versions, trusted).

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nimg: the node image object
  @param base: the 'template' node we match against (e.g. from the master)

  """
  assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

  beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
  for os_name, os_data in nimg.oslist.items():
    assert os_data, "Empty OS status for OS %s?!" % os_name
    # only the first entry is authoritative; extra ones are reported below
    f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
    self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                  "Invalid OS %s (located at %s): %s",
                  os_name, f_path, f_diag)
    self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                  "OS '%s' has multiple entries"
                  " (first one shadows the rest): %s",
                  os_name, utils.CommaJoin([v[0] for v in os_data]))
    # comparisons with the 'base' image
    test = os_name not in base.oslist
    self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
                  "Extra OS %s not present on reference node (%s)",
                  os_name, self.cfg.GetNodeName(base.uuid))
    if test:
      continue
    assert base.oslist[os_name], "Base node has empty OS status?"
    _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
    if not b_status:
      # base OS is invalid, skipping
      continue
    # set-valued fields are compared sorted so the report is stable
    for kind, a, b in [("API version", f_api, b_api),
                       ("variants list", f_var, b_var),
                       ("parameters", beautify_params(f_param),
                        beautify_params(b_param))]:
      self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                    "OS %s for %s differs from reference node %s:"
                    " [%s] vs. [%s]", kind, os_name,
                    self.cfg.GetNodeName(base.uuid),
                    utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
    # the trusted flag is a plain boolean, reported without brackets
    for kind, a, b in [("trusted", f_trusted, b_trusted)]:
      self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                    "OS %s for %s differs from reference node %s:"
                    " %s vs. %s", kind, os_name,
                    self.cfg.GetNodeName(base.uuid), a, b)

  # check any missing OSes
  missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
  self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                "OSes present on reference node %s"
                " but missing on this node: %s",
                self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1386
def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
  """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @type is_master: bool
  @param is_master: Whether node is the master node

  """
  cluster = self.cfg.GetClusterInfo()
  file_storage_used = (cluster.IsFileStorageEnabled() or
                       cluster.IsSharedFileStorageEnabled())
  if is_master and file_storage_used:
    if constants.NV_ACCEPTED_STORAGE_PATHS not in nresult:
      # This should never happen
      self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                    "Node did not return forbidden file storage paths")
    else:
      fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
      self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                    "Found forbidden file storage paths: %s",
                    utils.CommaJoin(fspaths))
  else:
    # only the master is expected to report this key at all
    self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
                  constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                  "Node should not have returned forbidden file storage"
                  " paths")
1416
def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
                        verify_key, error_key):
  """Verifies (file) storage paths.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @type file_disk_template: string
  @param file_disk_template: file-based disk template, whose directory
      is supposed to be verified
  @type verify_key: string
  @param verify_key: key for the verification map of this file
      verification step
  @param error_key: error key to be added to the verification results
      in case something goes wrong in this verification step

  """
  assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
            constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER
          ))

  cluster = self.cfg.GetClusterInfo()
  if cluster.IsDiskTemplateEnabled(file_disk_template):
    # the node only reports verify_key when it found the path unusable;
    # FIX: pass the format arguments to _ErrorIf (as every other call
    # site in this module does) instead of %-formatting the message
    # eagerly even when no error will be reported
    self._ErrorIf(
        verify_key in nresult,
        error_key, ninfo.name,
        "The configured %s storage path is unusable: %s",
        file_disk_template, nresult.get(verify_key))
1445
def _VerifyFileStoragePaths(self, ninfo, nresult):
  """Verifies the file storage directory of a node.

  @see: C{_VerifyStoragePaths}

  """
  check = (constants.DT_FILE,
           constants.NV_FILE_STORAGE_PATH,
           constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
  self._VerifyStoragePaths(ninfo, nresult, *check)
1456
def _VerifySharedFileStoragePaths(self, ninfo, nresult):
  """Verifies the shared file storage directory of a node.

  @see: C{_VerifyStoragePaths}

  """
  check = (constants.DT_SHARED_FILE,
           constants.NV_SHARED_FILE_STORAGE_PATH,
           constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
  self._VerifyStoragePaths(ninfo, nresult, *check)
1467
def _VerifyGlusterStoragePaths(self, ninfo, nresult):
  """Verifies the Gluster storage directory of a node.

  @see: C{_VerifyStoragePaths}

  """
  check = (constants.DT_GLUSTER,
           constants.NV_GLUSTER_STORAGE_PATH,
           constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
  self._VerifyStoragePaths(ninfo, nresult, *check)
1478
def _VerifyOob(self, ninfo, nresult):
  """Verifies out of band functionality of a node.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  # We just have to verify the paths on master and/or master candidates
  # as the oob helper is invoked on the master
  if not (ninfo.master_candidate or ninfo.master_capable):
    return
  for path_result in nresult.get(constants.NV_OOB_PATHS, []):
    self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
                  ninfo.name, path_result)
1494
def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
  """Verifies and updates the node volume data.

  This function will update a L{NodeImage}'s internal structures
  with data from the remote call.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object
  @param vg_name: the configured VG name

  """
  # assume failure until a valid LV list is seen
  nimg.lvm_fail = True
  lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
  if vg_name is None:
    # no volume group configured; nothing to update
    pass
  elif isinstance(lvdata, dict):
    nimg.volumes = lvdata
    nimg.lvm_fail = False
  elif isinstance(lvdata, basestring):
    # a string result carries the node-side error description
    self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
                  "LVM problem on node: %s", utils.SafeEncode(lvdata))
  else:
    self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
                  "rpc call to node failed (lvlist)")
1521
def _UpdateNodeInstances(self, ninfo, nresult, nimg):
  """Verifies and updates the node instance list.

  If the listing was successful, then updates this node's instance
  list. Otherwise, it marks the RPC call as failed for the instance
  list key.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object

  """
  idata = nresult.get(constants.NV_INSTANCELIST, None)
  if isinstance(idata, list):
    # translate the reported instance names into UUIDs
    nimg.instances = [uuid for (uuid, _) in
                      self.cfg.GetMultiInstanceInfoByName(idata)]
  else:
    self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
                  "rpc call to node failed (instancelist): %s",
                  utils.SafeEncode(str(idata)))
    nimg.hyp_fail = True
1545
def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
  """Verifies and computes a node information map.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object
  @param vg_name: the configured VG name

  """
  # try to read free memory (from the hypervisor)
  hv_info = nresult.get(constants.NV_HVINFO, None)
  hv_ok = (isinstance(hv_info, dict) and
           "memory_free" in hv_info and
           "memory_total" in hv_info and
           "memory_dom0" in hv_info)
  self._ErrorIf(not hv_ok, constants.CV_ENODEHV, ninfo.name,
                "rpc call to node failed (hvinfo)")
  if hv_ok:
    try:
      nimg.mfree = int(hv_info["memory_free"])
      nimg.mtotal = int(hv_info["memory_total"])
      nimg.mdom0 = int(hv_info["memory_dom0"])
    except (ValueError, TypeError):
      self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                    "node returned invalid nodeinfo, check hypervisor")

  # FIXME: devise a free space model for file based instances as well
  if vg_name is not None:
    vg_ok = (constants.NV_VGLIST in nresult and
             vg_name in nresult[constants.NV_VGLIST])
    self._ErrorIf(not vg_ok, constants.CV_ENODELVM, ninfo.name,
                  "node didn't return data for the volume group '%s'"
                  " - it is either missing or broken", vg_name)
    if vg_ok:
      try:
        nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
      except (ValueError, TypeError):
        self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                      "node returned invalid LVM info, check LVM status")
1585
def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
  """Gets per-disk status information for all instances.

  @type node_uuids: list of strings
  @param node_uuids: Node UUIDs
  @type node_image: dict of (UUID, L{objects.Node})
  @param node_image: Node objects
  @type instanceinfo: dict of (UUID, L{objects.Instance})
  @param instanceinfo: Instance objects
  @rtype: {instance: {node: [(succes, payload)]}}
  @return: a dictionary of per-instance dictionaries with nodes as
    keys and disk information as values; the disk information is a
    list of tuples (success, payload)

  """
  node_disks = {}
  node_disks_dev_inst_only = {}
  diskless_instances = set()
  nodisk_instances = set()

  for nuuid in node_uuids:
    # all instances this node is primary or secondary for
    node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                           node_image[nuuid].sinst))
    diskless_instances.update(uuid for uuid in node_inst_uuids
                              if not instanceinfo[uuid].disks)
    disks = [(inst_uuid, disk)
             for inst_uuid in node_inst_uuids
             for disk in self.cfg.GetInstanceDisks(inst_uuid)]

    if not disks:
      # instances that declare disks but whose disk list resolves empty
      nodisk_instances.update(uuid for uuid in node_inst_uuids
                              if instanceinfo[uuid].disks)
      # No need to collect data
      continue

    node_disks[nuuid] = disks

    # _AnnotateDiskParams makes already copies of the disks
    dev_inst_only = []
    for (inst_uuid, dev) in disks:
      (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                        self.cfg)
      dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

    node_disks_dev_inst_only[nuuid] = dev_inst_only

  assert len(node_disks) == len(node_disks_dev_inst_only)

  # Collect data from all nodes with disks
  result = self.rpc.call_blockdev_getmirrorstatus_multi(
             node_disks.keys(), node_disks_dev_inst_only)

  assert len(result) == len(node_disks)

  instdisk = {}

  for (nuuid, nres) in result.items():
    node = self.cfg.GetNodeInfo(nuuid)
    disks = node_disks[node.uuid]

    if nres.offline:
      # No data from this node
      data = len(disks) * [(False, "node offline")]
    else:
      msg = nres.fail_msg
      self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                    "while getting disk information: %s", msg)
      if msg:
        # No data from this node
        data = len(disks) * [(False, msg)]
      else:
        data = []
        for idx, i in enumerate(nres.payload):
          if isinstance(i, (tuple, list)) and len(i) == 2:
            data.append(i)
          else:
            logging.warning("Invalid result from node %s, entry %d: %s",
                            node.name, idx, i)
            data.append((False, "Invalid result from the remote node"))

    # data is positionally aligned with disks, so zip pairs each
    # (inst_uuid, disk) with its status
    for ((inst_uuid, _), status) in zip(disks, data):
      instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
        .append(status)

  # Add empty entries for diskless instances.
  for inst_uuid in diskless_instances:
    assert inst_uuid not in instdisk
    instdisk[inst_uuid] = {}
  # ...and disk-full instances that happen to have no disks
  for inst_uuid in nodisk_instances:
    assert inst_uuid not in instdisk
    instdisk[inst_uuid] = {}

  assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                    len(nuuids) <= len(
                      self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                    compat.all(isinstance(s, (tuple, list)) and
                               len(s) == 2 for s in statuses)
                    for inst, nuuids in instdisk.items()
                    for nuuid, statuses in nuuids.items())
  if __debug__:
    instdisk_keys = set(instdisk)
    instanceinfo_keys = set(instanceinfo)
    assert instdisk_keys == instanceinfo_keys, \
      ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
       (instdisk_keys, instanceinfo_keys))

  return instdisk
1694 1695 @staticmethod
1696 - def _SshNodeSelector(group_uuid, all_nodes):
1697 """Create endless iterators for all potential SSH check hosts. 1698 1699 """ 1700 nodes = [node for node in all_nodes 1701 if (node.group != group_uuid and 1702 not node.offline)] 1703 keyfunc = operator.attrgetter("group") 1704 1705 return map(itertools.cycle, 1706 [sorted(map(operator.attrgetter("name"), names)) 1707 for _, names in itertools.groupby(sorted(nodes, key=keyfunc), 1708 keyfunc)])
1709 1710 @classmethod
def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
  """Choose which nodes should talk to which other nodes.

  We will make nodes contact all nodes in their group, and one node from
  every other group.

  @rtype: tuple of (string, dict of strings to list of strings, string)
  @return: a tuple containing the list of all online nodes, a dictionary
    mapping node names to additional nodes of other node groups to which
    connectivity should be tested, and a list of all online master
    candidates

  @warning: This algorithm has a known issue if one node group is much
    smaller than others (e.g. just one node). In such a case all other
    nodes will talk to the single node.

  """
  online_nodes = sorted(node.name for node in group_nodes if not node.offline)
  online_mcs = sorted(node.name for node in group_nodes
                      if (node.master_candidate and not node.offline))
  # FIX: materialize the selector result so it can be re-iterated once
  # per online node (on Python 3 _SshNodeSelector yields a one-shot map
  # object; on Python 2 this is a cheap list copy)
  sel = list(cls._SshNodeSelector(group_uuid, all_nodes))

  return (online_nodes,
          # FIX: next(i) instead of the Python-2-only i.next() — the
          # builtin works on Python 2.6+ and Python 3 alike
          dict((name, sorted(next(i) for i in sel))
               for name in online_nodes),
          online_mcs)
1737
1738 - def _PrepareSshSetupCheck(self):
1739 """Prepare the input data for the SSH setup verification. 1740 1741 """ 1742 all_nodes_info = self.cfg.GetAllNodesInfo() 1743 potential_master_candidates = self.cfg.GetPotentialMasterCandidates() 1744 node_status = [ 1745 (uuid, node_info.name, node_info.master_candidate, 1746 node_info.name in potential_master_candidates, not node_info.offline) 1747 for (uuid, node_info) in all_nodes_info.items()] 1748 return node_status
1749
1750 - def BuildHooksEnv(self):
1751 """Build hooks env. 1752 1753 Cluster-Verify hooks just ran in the post phase and their failure makes 1754 the output be logged in the verify output and the verification to fail. 1755 1756 """ 1757 env = { 1758 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()), 1759 } 1760 1761 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags())) 1762 for node in self.my_node_info.values()) 1763 1764 return env
1765
1766 - def BuildHooksNodes(self):
1767 """Build hooks nodes. 1768 1769 """ 1770 return ([], list(self.my_node_info.keys()))
1771 1772 @staticmethod
1773 - def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced, 1774 i_offline, n_offline, n_drained):
1775 feedback_fn("* Other Notes") 1776 if i_non_redundant: 1777 feedback_fn(" - NOTICE: %d non-redundant instance(s) found." 1778 % len(i_non_redundant)) 1779 1780 if i_non_a_balanced: 1781 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found." 1782 % len(i_non_a_balanced)) 1783 1784 if i_offline: 1785 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline) 1786 1787 if n_offline: 1788 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline) 1789 1790 if n_drained: 1791 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1792
1793 - def _VerifyExclusionTags(self, nodename, pinst, ctags):
1794 """Verify that all instances have different exclusion tags. 1795 1796 @type nodename: string 1797 @param nodename: the name of the node for which the check is done 1798 @type pinst: list of string 1799 @param pinst: list of UUIDs of those instances having the given node 1800 as primary node 1801 @type ctags: list of string 1802 @param ctags: tags of the cluster 1803 1804 """ 1805 exclusion_prefixes = utils.GetExclusionPrefixes(ctags) 1806 tags_seen = set([]) 1807 conflicting_tags = set([]) 1808 for iuuid in pinst: 1809 allitags = self.my_inst_info[iuuid].tags 1810 if allitags is None: 1811 allitags = [] 1812 itags = set([tag for tag in allitags 1813 if utils.IsGoodTag(exclusion_prefixes, tag)]) 1814 conflicts = itags.intersection(tags_seen) 1815 if len(conflicts) > 0: 1816 conflicting_tags = conflicting_tags.union(conflicts) 1817 tags_seen = tags_seen.union(itags) 1818 1819 self._ErrorIf(len(conflicting_tags) > 0, constants.CV_EEXTAGS, nodename, 1820 "Tags where there is more than one instance: %s", 1821 list(conflicting_tags), code=constants.CV_WARNING)
1822
  def Exec(self, feedback_fn): # pylint: disable=R0915
    """Verify integrity of the node group, performing various test on nodes.

    @param feedback_fn: function used to report progress back to the caller
    @rtype: boolean
    @return: True if no problem was detected (or the group was empty),
      False otherwise

    """
    # This method has too many local variables. pylint: disable=R0914
    feedback_fn("* Verifying group '%s'" % self.group_info.name)

    if not self.my_node_uuids:
      # empty node group
      feedback_fn("* Empty node group, skipping verification")
      return True

    self.bad = False
    verbose = self.op.verbose
    self._feedback_fn = feedback_fn

    vg_name = self.cfg.GetVGName()
    drbd_helper = self.cfg.GetDRBDHelper()
    cluster = self.cfg.GetClusterInfo()
    hypervisors = cluster.enabled_hypervisors
    node_data_list = self.my_node_info.values()

    i_non_redundant = [] # Non redundant instances
    i_non_a_balanced = [] # Non auto-balanced instances
    i_offline = 0 # Count of offline instances
    n_offline = 0 # Count of offline nodes
    n_drained = 0 # Count of nodes being drained
    node_vol_should = {}  # expected LVs per node, filled from the config

    # FIXME: verify OS list

    # File verification
    filemap = ComputeAncillaryFiles(cluster, False)

    # do local checksums
    master_node_uuid = self.master_node = self.cfg.GetMasterNode()
    master_ip = self.cfg.GetMasterIP()

    feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))

    user_scripts = []
    if self.cfg.GetUseExternalMipScript():
      user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # Parameters for the node-verify RPC; keys are NV_* check names, values
    # are the per-check inputs (None where the check needs none)
    node_verify_param = {
      constants.NV_FILELIST:
        map(vcluster.MakeVirtualPath,
            utils.UniqueSequence(filename
                                 for files in filemap
                                 for filename in files)),
      constants.NV_NODELIST:
        self._SelectSshCheckNodes(node_data_list, self.group_uuid,
                                  self.all_node_info.values()),
      constants.NV_HYPERVISOR: hypervisors,
      constants.NV_HVPARAMS:
        _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
      constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
                                 for node in node_data_list
                                 if not node.offline],
      constants.NV_INSTANCELIST: hypervisors,
      constants.NV_VERSION: None,
      constants.NV_HVINFO: self.cfg.GetHypervisorType(),
      constants.NV_NODESETUP: None,
      constants.NV_TIME: None,
      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
      constants.NV_OSLIST: None,
      constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
      constants.NV_USERSCRIPTS: user_scripts,
      constants.NV_CLIENT_CERT: None,
      }

    if self.cfg.GetClusterInfo().modify_ssh_setup:
      node_verify_param[constants.NV_SSH_SETUP] = \
        (self._PrepareSshSetupCheck(), self.cfg.GetClusterInfo().ssh_key_type)
      if self.op.verify_clutter:
        node_verify_param[constants.NV_SSH_CLUTTER] = True

    if vg_name is not None:
      node_verify_param[constants.NV_VGLIST] = None
      node_verify_param[constants.NV_LVLIST] = vg_name
      node_verify_param[constants.NV_PVLIST] = [vg_name]

    if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
      if drbd_helper:
        node_verify_param[constants.NV_DRBDVERSION] = None
        node_verify_param[constants.NV_DRBDLIST] = None
        node_verify_param[constants.NV_DRBDHELPER] = drbd_helper

    if cluster.IsFileStorageEnabled() or \
        cluster.IsSharedFileStorageEnabled():
      # Load file storage paths only from master node
      node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
        self.cfg.GetMasterNodeName()
      if cluster.IsFileStorageEnabled():
        node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
          cluster.file_storage_dir
      if cluster.IsSharedFileStorageEnabled():
        node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
          cluster.shared_file_storage_dir

    # bridge checks
    # FIXME: this needs to be changed per node-group, not cluster-wide
    bridges = set()
    default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
    if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.add(default_nicpp[constants.NIC_LINK])
    for inst_uuid in self.my_inst_info.values():
      # NOTE: despite the name, "inst_uuid" holds instance objects here
      for nic in inst_uuid.nics:
        full_nic = cluster.SimpleFillNIC(nic.nicparams)
        if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          bridges.add(full_nic[constants.NIC_LINK])

    if bridges:
      node_verify_param[constants.NV_BRIDGES] = list(bridges)

    # Build our expected cluster state
    node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
                                                 uuid=node.uuid,
                                                 vm_capable=node.vm_capable))
                      for node in node_data_list)

    # Gather OOB paths
    oob_paths = []
    for node in self.all_node_info.values():
      path = SupportsOob(self.cfg, node)
      if path and path not in oob_paths:
        oob_paths.append(path)

    if oob_paths:
      node_verify_param[constants.NV_OOB_PATHS] = oob_paths

    # Fill the expected state (node_image, node_vol_should) with
    # per-instance data: primaries, secondaries and expected volumes
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if instance.admin_state == constants.ADMINST_OFFLINE:
        i_offline += 1

      inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
      for nuuid in inst_nodes:
        if nuuid not in node_image:
          # node belongs to another group (or is unknown): add a ghost image
          gnode = self.NodeImage(uuid=nuuid)
          gnode.ghost = (nuuid not in self.all_node_info)
          node_image[nuuid] = gnode

      self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

      pnode = instance.primary_node
      node_image[pnode].pinst.append(instance.uuid)

      for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        nimg = node_image[snode]
        nimg.sinst.append(instance.uuid)
        if pnode not in nimg.sbp:
          nimg.sbp[pnode] = []
        nimg.sbp[pnode].append(instance.uuid)

    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
                                               self.my_node_info.keys())
    # The value of exclusive_storage should be the same across the group, so if
    # it's True for at least a node, we act as if it were set for all the nodes
    self._exclusive_storage = compat.any(es_flags.values())
    if self._exclusive_storage:
      node_verify_param[constants.NV_EXCLUSIVEPVS] = True

    # At this point, we have the in-memory data structures complete,
    # except for the runtime information, which we'll gather next

    # NOTE: Here we lock the configuration for the duration of RPC calls,
    # which means that the cluster configuration changes are blocked during
    # this period.
    # This is something that should be done only exceptionally and only for
    # justified cases!
    # In this case, we need the lock as we can only verify the integrity of
    # configuration files on MCs only if we know nobody else is modifying it.
    # FIXME: The check for integrity of config.data should be moved to
    # WConfD, which is the only one who can otherwise ensure nobody
    # will modify the configuration during the check.
    with self.cfg.GetConfigManager(shared=True, forcelock=True):
      feedback_fn("* Gathering information about nodes (%s nodes)" %
                  len(self.my_node_uuids))
      # Force the configuration to be fully distributed before doing any tests
      self.cfg.FlushConfigGroup(self.group_uuid)
      # Due to the way our RPC system works, exact response times cannot be
      # guaranteed (e.g. a broken node could run into a timeout). By keeping
      # the time before and after executing the request, we can at least have
      # a time window.
      nvinfo_starttime = time.time()
      # Get lock on the configuration so that nobody modifies it concurrently.
      # Otherwise it can be modified by other jobs, failing the consistency
      # test.
      # NOTE: This is an exceptional situation, we should otherwise avoid
      # locking the configuration for something but very fast, pure operations.
      cluster_name = self.cfg.GetClusterName()
      hvparams = self.cfg.GetClusterInfo().hvparams
      all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                             node_verify_param,
                                             cluster_name,
                                             hvparams)
      nvinfo_endtime = time.time()

      if self.extra_lv_nodes and vg_name is not None:
        feedback_fn("* Gathering information about extra nodes (%s nodes)" %
                    len(self.extra_lv_nodes))
        extra_lv_nvinfo = \
            self.rpc.call_node_verify(self.extra_lv_nodes,
                                      {constants.NV_LVLIST: vg_name},
                                      self.cfg.GetClusterName(),
                                      self.cfg.GetClusterInfo().hvparams)
      else:
        extra_lv_nvinfo = {}

      # If not all nodes are being checked, we need to make sure the master
      # node and a non-checked vm_capable node are in the list.
      absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
      if absent_node_uuids:
        vf_nvinfo = all_nvinfo.copy()
        vf_node_info = list(self.my_node_info.values())
        additional_node_uuids = []
        if master_node_uuid not in self.my_node_info:
          additional_node_uuids.append(master_node_uuid)
          vf_node_info.append(self.all_node_info[master_node_uuid])
        # Add the first vm_capable node we find which is not included,
        # excluding the master node (which we already have)
        for node_uuid in absent_node_uuids:
          nodeinfo = self.all_node_info[node_uuid]
          if (nodeinfo.vm_capable and not nodeinfo.offline and
              node_uuid != master_node_uuid):
            additional_node_uuids.append(node_uuid)
            vf_node_info.append(self.all_node_info[node_uuid])
            break
        key = constants.NV_FILELIST

        feedback_fn("* Gathering information about the master node")
        vf_nvinfo.update(self.rpc.call_node_verify(
           additional_node_uuids, {key: node_verify_param[key]},
           self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams))
      else:
        vf_nvinfo = all_nvinfo
        vf_node_info = self.my_node_info.values()

    all_drbd_map = self.cfg.ComputeDRBDMap()

    feedback_fn("* Gathering disk information (%s nodes)" %
                len(self.my_node_uuids))
    instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
                                     self.my_inst_info)

    feedback_fn("* Verifying configuration file consistency")

    self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
    if self.cfg.GetClusterInfo().modify_ssh_setup:
      self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
    self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)

    feedback_fn("* Verifying node status")

    # reference node image used to compare OS lists across nodes
    refos_img = None

    # Per-node verification, driven by the RPC results gathered above
    for node_i in node_data_list:
      nimg = node_image[node_i.uuid]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node_i.name,))
        n_offline += 1
        continue

      if node_i.uuid == master_node_uuid:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        n_drained += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))

      msg = all_nvinfo[node_i.uuid].fail_msg
      self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
                    "while contacting node: %s", msg)
      if msg:
        nimg.rpc_fail = True
        continue

      nresult = all_nvinfo[node_i.uuid].payload

      nimg.call_ok = self._VerifyNode(node_i, nresult)
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
      self._VerifyNodeNetwork(node_i, nresult)
      self._VerifyNodeUserScripts(node_i, nresult)
      self._VerifyOob(node_i, nresult)
      self._VerifyAcceptedFileStoragePaths(node_i, nresult,
                                           node_i.uuid == master_node_uuid)
      self._VerifyFileStoragePaths(node_i, nresult)
      self._VerifySharedFileStoragePaths(node_i, nresult)
      self._VerifyGlusterStoragePaths(node_i, nresult)

      if nimg.vm_capable:
        self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
        if constants.DT_DRBD8 in cluster.enabled_disk_templates:
          self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
                               self.all_disks_info, drbd_helper, all_drbd_map)

        if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
            (constants.DT_DRBD8 in cluster.enabled_disk_templates):
          self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
        self._UpdateNodeInstances(node_i, nresult, nimg)
        self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
        self._UpdateNodeOS(node_i, nresult, nimg)

        if not nimg.os_fail:
          if refos_img is None:
            refos_img = nimg
          self._VerifyNodeOS(node_i, nimg, refos_img)
        self._VerifyNodeBridges(node_i, nresult, bridges)

        # Check whether all running instances are primary for the node. (This
        # can no longer be done from _VerifyInstance below, since some of the
        # wrong instances could be from other node groups.)
        non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)

        for inst_uuid in non_primary_inst_uuids:
          test = inst_uuid in self.all_inst_info
          self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
                        self.cfg.GetInstanceName(inst_uuid),
                        "instance should not run on node %s", node_i.name)
          self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE,
                        node_i.name,
                        "node is running unknown instance %s", inst_uuid)

      self._VerifyExclusionTags(node_i.name, nimg.pinst, cluster.tags)

    self._VerifyGroupDRBDVersion(all_nvinfo)
    self._VerifyGroupLVM(node_image, vg_name)

    for node_uuid, result in extra_lv_nvinfo.items():
      self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
                              node_image[node_uuid], vg_name)

    feedback_fn("* Verifying instance status")
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if verbose:
        feedback_fn("* Verifying instance %s" % instance.name)
      self._VerifyInstance(instance, node_image, instdisk[inst_uuid])

      # If the instance is not fully redundant we cannot survive losing its
      # primary node, so we are not N+1 compliant.
      inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
      if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
        i_non_redundant.append(instance)

      if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
        i_non_a_balanced.append(instance)

    feedback_fn("* Verifying orphan volumes")
    reserved = utils.FieldSet(*cluster.reserved_lvs)

    # We will get spurious "unknown volume" warnings if any node of this group
    # is secondary for an instance whose primary is in another group. To avoid
    # them, we find these instances and add their volumes to node_vol_should.
    for instance in self.all_inst_info.values():
      for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        if (secondary in self.my_node_info
                and instance.uuid not in self.my_inst_info):
          self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
          break

    self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)

    if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
      feedback_fn("* Verifying N+1 Memory redundancy")
      self._VerifyNPlusOneMemory(node_image, self.my_inst_info)

    self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                           i_offline, n_offline, n_drained)

    return not self.bad
2201
2202 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2203 """Analyze the post-hooks' result 2204 2205 This method analyses the hook result, handles it, and sends some 2206 nicely-formatted feedback back to the user. 2207 2208 @param phase: one of L{constants.HOOKS_PHASE_POST} or 2209 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase 2210 @param hooks_results: the results of the multi-node hooks rpc call 2211 @param feedback_fn: function used send feedback back to the caller 2212 @param lu_result: previous Exec result 2213 @return: the new Exec result, based on the previous result 2214 and hook results 2215 2216 """ 2217 # We only really run POST phase hooks, only for non-empty groups, 2218 # and are only interested in their results 2219 if not self.my_node_uuids: 2220 # empty node group 2221 pass 2222 elif phase == constants.HOOKS_PHASE_POST: 2223 # Used to change hooks' output to proper indentation 2224 feedback_fn("* Hooks Results") 2225 assert hooks_results, "invalid result from hooks" 2226 2227 for node_name in hooks_results: 2228 res = hooks_results[node_name] 2229 msg = res.fail_msg 2230 test = msg and not res.offline 2231 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2232 "Communication failure in hooks execution: %s", msg) 2233 if test: 2234 lu_result = False 2235 continue 2236 if res.offline: 2237 # No need to investigate payload if node is offline 2238 continue 2239 for script, hkr, output in res.payload: 2240 test = hkr == constants.HKR_FAIL 2241 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2242 "Script %s failed, output:", script) 2243 if test: 2244 output = self._HOOKS_INDENT_RE.sub(" ", output) 2245 feedback_fn("%s" % output) 2246 lu_result = False 2247 2248 return lu_result
2249