Package ganeti :: Package cmdlib :: Package cluster :: Module verify
[hide private]
[frames] | [no frames]

Source Code for Module ganeti.cmdlib.cluster.verify

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2014 Google Inc. 
   5  # All rights reserved. 
   6  # 
   7  # Redistribution and use in source and binary forms, with or without 
   8  # modification, are permitted provided that the following conditions are 
   9  # met: 
  10  # 
  11  # 1. Redistributions of source code must retain the above copyright notice, 
  12  # this list of conditions and the following disclaimer. 
  13  # 
  14  # 2. Redistributions in binary form must reproduce the above copyright 
  15  # notice, this list of conditions and the following disclaimer in the 
  16  # documentation and/or other materials provided with the distribution. 
  17  # 
  18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
  19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
  20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
  22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  29   
  30  """Logical units for cluster verification.""" 
  31   
  32  import itertools 
  33  import logging 
  34  import operator 
  35  import re 
  36  import time 
  37  import ganeti.masterd.instance 
  38  import ganeti.rpc.node as rpc 
  39   
  40  from ganeti import compat 
  41  from ganeti import constants 
  42  from ganeti import errors 
  43  from ganeti import locking 
  44  from ganeti import pathutils 
  45  from ganeti import utils 
  46  from ganeti import vcluster 
  47  from ganeti import hypervisor 
  48  from ganeti import opcodes 
  49   
  50  from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs 
  51  from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \ 
  52      CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \ 
  53      SupportsOob 
def _GetAllHypervisorParameters(cluster, instances):
  """Compute the set of all hypervisor parameters.

  @type cluster: L{objects.Cluster}
  @param cluster: the cluster object
  @type instances: list of L{objects.Instance}
  @param instances: additional instances from which to obtain parameters
  @rtype: list of (origin, hypervisor, parameters)
  @return: a list with all parameters found, indicating the hypervisor they
    apply to, and the origin (can be "cluster", "os X", or "instance Y")

  """
  hvp_data = []

  # Cluster-wide defaults, one entry per enabled hypervisor
  for hv_name in cluster.enabled_hypervisors:
    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))

  # Per-OS overrides, expanded with the cluster defaults filled in
  for os_name, os_hvp in cluster.os_hvp.items():
    for hv_name, hv_params in os_hvp.items():
      if hv_params:
        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
        hvp_data.append(("os %s" % os_name, hv_name, full_params))

  # TODO: collapse identical parameter values in a single one
  for instance in instances:
    if instance.hvparams:
      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
                       cluster.FillHV(instance)))

  return hvp_data
86
class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt, _ = ecode
    # Errors the operation was told to ignore are demoted to warnings
    if etxt in self.op.ignore_errors: # pylint: disable=E1101
      ltype = self.ETYPE_WARNING
    # complete the message with the positional arguments first
    if args:
      msg = msg % args
    # then produce either the machine-parseable or the human form
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      item = " " + item if item else ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # report it via the feedback_fn
    self._feedback_fn("  - %s" % msg) # Mix-in. pylint: disable=E1101
    # only genuine errors (not warnings) mark the operation as failed
    if ltype == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    if bool(cond) or self.op.debug_simulate_errors: # pylint: disable=E1101
      self._Error(*args, **kwargs)
140
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification jobs.

    @return: a L{ResultWithJobs} with the config-verification job followed
      by one L{opcodes.OpClusterVerifyGroup} job per node group

    """
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
    else:
      groups = self.cfg.GetNodeGroupList()

    # Verify global configuration
    jobs.append([
      opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
      ])

    # Always depend on global verification
    depends_fn = lambda: [(-len(jobs), [])]

    # NOTE: this must stay a *generator*: extend() appends each group job
    # before evaluating the next element, so every depends_fn() call sees the
    # growing len(jobs) and its relative dependency -len(jobs) always points
    # back at the config-verification job. A list comprehension would compute
    # all dependencies up front and break this.
    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs): # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        # only OpClusterVerifyConfig lacks skip_checks
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
186
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.share_locks = ShareAll()
    self.needed_locks = {
      locking.LEVEL_NODEGROUP: locking.ALL_SET,
      }

  def Exec(self, feedback_fn):
    """Submit one L{opcodes.OpGroupVerifyDisks} job per node group."""
    per_group_jobs = [[opcodes.OpGroupVerifyDisks(group_name=group)]
                      for group in self.owned_locks(locking.LEVEL_NODEGROUP)]
    return ResultWithJobs(per_group_jobs)
206
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @param hvp_data: list of (origin, hypervisor, parameters) tuples, as
      produced by L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      # FIX: the hypervisor name and the parameter origin ("cluster",
      # "os X", "instance Y") were interpolated in the wrong order
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError as err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @return: True when no errors (warnings are allowed) were reported

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)
    # hoisted out of the instance loop; rebuilding this per instance was O(n*m)
    dangling_node_uuids = set(node.uuid for node in dangling_nodes)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
308
309 310 -class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
311 """Verifies the status of a node group. 312 313 """ 314 HPATH = "cluster-verify" 315 HTYPE = constants.HTYPE_CLUSTER 316 REQ_BGL = False 317 318 _HOOKS_INDENT_RE = re.compile("^", re.M) 319
  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type uuid: string
    @ivar uuid: the node UUID to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call was successfull (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances
    @type pv_min: float
    @ivar pv_min: size in MiB of the smallest PVs
    @type pv_max: float
    @ivar pv_max: size in MiB of the biggest PVs

    """
    def __init__(self, offline=False, uuid=None, vm_capable=True):
      """Initialize every field to its "no data collected yet" default.

      Runtime fields are filled in later from the node-verify RPC results;
      config fields from the cluster configuration.

      """
      self.uuid = uuid
      # runtime data, populated from the RPC results
      self.volumes = {}
      self.instances = []
      # config data, populated from the cluster configuration
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      # failure flags for the individual data categories
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}
      # pv_min/pv_max stay None when no PV data was returned
      self.pv_min = None
      self.pv_max = None
375
376 - def ExpandNames(self):
377 # This raises errors.OpPrereqError on its own: 378 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name) 379 380 # Get instances in node group; this is unsafe and needs verification later 381 inst_uuids = \ 382 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True) 383 384 self.needed_locks = { 385 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids), 386 locking.LEVEL_NODEGROUP: [self.group_uuid], 387 locking.LEVEL_NODE: [], 388 } 389 390 self.share_locks = ShareAll()
391
392 - def DeclareLocks(self, level):
393 if level == locking.LEVEL_NODE: 394 # Get members of node group; this is unsafe and needs verification later 395 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members) 396 397 # In Exec(), we warn about mirrored instances that have primary and 398 # secondary living in separate node groups. To fully verify that 399 # volumes for these instances are healthy, we will need to do an 400 # extra call to their secondaries. We ensure here those nodes will 401 # be locked. 402 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE): 403 # Important: access only the instances whose lock is owned 404 instance = self.cfg.GetInstanceInfoByName(inst_name) 405 disks = self.cfg.GetInstanceDisks(instance.uuid) 406 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR): 407 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid)) 408 409 self.needed_locks[locking.LEVEL_NODE] = nodes
410
  def CheckPrereq(self):
    """Check prerequisites.

    Verifies that all nodes and (primary) instances of the group, plus the
    out-of-group secondaries needed for the LV checks, are covered by the
    locks acquired in ExpandNames/DeclareLocks, and snapshots the
    configuration data used by Exec.

    @raise errors.OpPrereqError: if a needed node or instance lock is missing
        (the group changed between lock declaration and acquisition)

    """
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    # Re-read group membership; it may differ from what ExpandNames saw
    group_node_uuids = set(self.group_info.members)
    group_inst_uuids = \
      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

    unlocked_node_uuids = \
      group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

    unlocked_inst_uuids = \
      group_inst_uuids.difference(
        [self.cfg.GetInstanceInfoByName(name).uuid
         for name in self.owned_locks(locking.LEVEL_INSTANCE)])

    if unlocked_node_uuids:
      raise errors.OpPrereqError(
        "Missing lock for nodes: %s" %
        utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
        errors.ECODE_STATE)

    if unlocked_inst_uuids:
      raise errors.OpPrereqError(
        "Missing lock for instances: %s" %
        utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
        errors.ECODE_STATE)

    # Snapshot the configuration for use throughout Exec
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.all_disks_info = self.cfg.GetAllDisksInfo()

    self.my_node_uuids = group_node_uuids
    self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                             for node_uuid in group_node_uuids)

    self.my_inst_uuids = group_inst_uuids
    self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                             for inst_uuid in group_inst_uuids)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = set()

    for inst in self.my_inst_info.values():
      disks = self.cfg.GetInstanceDisks(inst.uuid)
      if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
        for nuuid in inst_nodes:
          # only nodes outside this group need the extra LV RPC call
          if self.all_node_info[nuuid].group != self.group_uuid:
            extra_lv_nodes.add(nuuid)

    unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                                 utils.CommaJoin(unlocked_lv_nodes),
                                 errors.ECODE_STATE)
    self.extra_lv_nodes = list(extra_lv_nodes)
471
  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the respose)

    """
    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    # the node must report a (protocol, release) 2-tuple
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "connection to node returned invalid data")
    if test:
      return False

    # protocol mismatch makes the rest of the payload uninterpretable
    test = local_version != remote_version[0]
    self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
                  "incompatible protocol versions: master %s,"
                  " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version; a mere release mismatch is only a warning
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  constants.CV_ENODEVERSION, ninfo.name,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        # a non-None entry carries the error message of the failed check
        test = hv_result is not None
        self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      # every entry in this list is a reported parameter failure
      for item, hv_name, hv_result in hvp_result:
        self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s parameter verify failure (source %s): %s",
                      hv_name, item, hv_result)

    # a non-empty list of messages means the node setup check failed
    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
                  "node setup error: %s", "; ".join(test))

    return True
541
542 - def _VerifyNodeTime(self, ninfo, nresult, 543 nvinfo_starttime, nvinfo_endtime):
544 """Check the node time. 545 546 @type ninfo: L{objects.Node} 547 @param ninfo: the node to check 548 @param nresult: the remote results for the node 549 @param nvinfo_starttime: the start time of the RPC call 550 @param nvinfo_endtime: the end time of the RPC call 551 552 """ 553 ntime = nresult.get(constants.NV_TIME, None) 554 try: 555 ntime_merged = utils.MergeTime(ntime) 556 except (ValueError, TypeError): 557 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name, 558 "Node returned invalid time") 559 return 560 561 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): 562 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged) 563 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW): 564 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime) 565 else: 566 ntime_diff = None 567 568 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name, 569 "Node time diverges by at least %s from master node time", 570 ntime_diff)
571
572 - def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
573 """Check the node LVM results and update info for cross-node checks. 574 575 @type ninfo: L{objects.Node} 576 @param ninfo: the node to check 577 @param nresult: the remote results for the node 578 @param vg_name: the configured VG name 579 @type nimg: L{NodeImage} 580 @param nimg: node image 581 582 """ 583 if vg_name is None: 584 return 585 586 # checks vg existence and size > 20G 587 vglist = nresult.get(constants.NV_VGLIST, None) 588 test = not vglist 589 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 590 "unable to check volume groups") 591 if not test: 592 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name, 593 constants.MIN_VG_SIZE) 594 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus) 595 596 # Check PVs 597 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage) 598 for em in errmsgs: 599 self._Error(constants.CV_ENODELVM, ninfo.name, em) 600 if pvminmax is not None: 601 (nimg.pv_min, nimg.pv_max) = pvminmax
602
603 - def _VerifyGroupDRBDVersion(self, node_verify_infos):
604 """Check cross-node DRBD version consistency. 605 606 @type node_verify_infos: dict 607 @param node_verify_infos: infos about nodes as returned from the 608 node_verify call. 609 610 """ 611 node_versions = {} 612 for node_uuid, ndata in node_verify_infos.items(): 613 nresult = ndata.payload 614 if nresult: 615 version = nresult.get(constants.NV_DRBDVERSION, None) 616 if version: 617 node_versions[node_uuid] = version 618 619 if len(set(node_versions.values())) > 1: 620 for node_uuid, version in sorted(node_versions.items()): 621 msg = "DRBD version mismatch: %s" % version 622 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg, 623 code=self.ETYPE_WARNING)
624
625 - def _VerifyGroupLVM(self, node_image, vg_name):
626 """Check cross-node consistency in LVM. 627 628 @type node_image: dict 629 @param node_image: info about nodes, mapping from node to names to 630 L{NodeImage} objects 631 @param vg_name: the configured VG name 632 633 """ 634 if vg_name is None: 635 return 636 637 # Only exclusive storage needs this kind of checks 638 if not self._exclusive_storage: 639 return 640 641 # exclusive_storage wants all PVs to have the same size (approximately), 642 # if the smallest and the biggest ones are okay, everything is fine. 643 # pv_min is None iff pv_max is None 644 vals = filter((lambda ni: ni.pv_min is not None), node_image.values()) 645 if not vals: 646 return 647 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals) 648 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals) 649 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax) 650 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name, 651 "PV sizes differ too much in the group; smallest (%s MB) is" 652 " on %s, biggest (%s MB) is on %s", 653 pvmin, self.cfg.GetNodeName(minnode_uuid), 654 pvmax, self.cfg.GetNodeName(maxnode_uuid))
655
656 - def _VerifyNodeBridges(self, ninfo, nresult, bridges):
657 """Check the node bridges. 658 659 @type ninfo: L{objects.Node} 660 @param ninfo: the node to check 661 @param nresult: the remote results for the node 662 @param bridges: the expected list of bridges 663 664 """ 665 if not bridges: 666 return 667 668 missing = nresult.get(constants.NV_BRIDGES, None) 669 test = not isinstance(missing, list) 670 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 671 "did not return valid bridge information") 672 if not test: 673 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name, 674 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
675
676 - def _VerifyNodeUserScripts(self, ninfo, nresult):
677 """Check the results of user scripts presence and executability on the node 678 679 @type ninfo: L{objects.Node} 680 @param ninfo: the node to check 681 @param nresult: the remote results for the node 682 683 """ 684 test = not constants.NV_USERSCRIPTS in nresult 685 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 686 "did not return user scripts information") 687 688 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None) 689 if not test: 690 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name, 691 "user scripts not present or not executable: %s" % 692 utils.CommaJoin(sorted(broken_scripts)))
693
694 - def _VerifyNodeNetwork(self, ninfo, nresult):
695 """Check the node network connectivity results. 696 697 @type ninfo: L{objects.Node} 698 @param ninfo: the node to check 699 @param nresult: the remote results for the node 700 701 """ 702 test = constants.NV_NODELIST not in nresult 703 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name, 704 "node hasn't returned node ssh connectivity data") 705 if not test: 706 if nresult[constants.NV_NODELIST]: 707 for a_node, a_msg in nresult[constants.NV_NODELIST].items(): 708 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name, 709 "ssh communication with node '%s': %s", a_node, a_msg) 710 711 test = constants.NV_NODENETTEST not in nresult 712 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 713 "node hasn't returned node tcp connectivity data") 714 if not test: 715 if nresult[constants.NV_NODENETTEST]: 716 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) 717 for anode in nlist: 718 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, 719 "tcp communication with node '%s': %s", 720 anode, nresult[constants.NV_NODENETTEST][anode]) 721 722 test = constants.NV_MASTERIP not in nresult 723 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, 724 "node hasn't returned node master IP reachability data") 725 if not test: 726 if not nresult[constants.NV_MASTERIP]: 727 if ninfo.uuid == self.master_node: 728 msg = "the master node cannot reach the master IP (not configured?)" 729 else: 730 msg = "cannot reach the master IP" 731 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
732
  def _VerifyInstance(self, instance, node_image, diskstatus):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node, and that the nodes are in the correct
    state.

    @type instance: L{objects.Instance}
    @param instance: the instance to verify
    @type node_image: dict
    @param node_image: mapping of node UUID to L{NodeImage}
    @param diskstatus: per-node disk status, mapping node name to a list of
        (success, status) pairs, one per disk

    """
    pnode_uuid = instance.primary_node
    pnode_img = node_image[pnode_uuid]
    groupinfo = self.cfg.GetAllNodeGroupsInfo()

    node_vol_should = {}
    self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

    # Instance policy violations are reported as warnings only
    cluster = self.cfg.GetClusterInfo()
    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                            self.group_info)
    err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
    self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
                  utils.CommaJoin(err), code=self.ETYPE_WARNING)

    # Check that every expected LV is present on its node
    for node_uuid in node_vol_should:
      n_img = node_image[node_uuid]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in node_vol_should[node_uuid]:
        test = volume not in n_img.volumes
        self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
                      "volume %s missing on node %s", volume,
                      self.cfg.GetNodeName(node_uuid))

    if instance.admin_state == constants.ADMINST_UP:
      test = instance.uuid not in pnode_img.instances and not pnode_img.offline
      self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
                    "instance not running on its primary node %s",
                    self.cfg.GetNodeName(pnode_uuid))
      self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
                    instance.name, "instance is marked as running and lives on"
                    " offline node %s", self.cfg.GetNodeName(pnode_uuid))

    # Flatten diskstatus into (node, success, status, disk-index) tuples
    diskdata = [(nname, success, status, idx)
                for (nname, disks) in diskstatus.items()
                for idx, (success, status) in enumerate(disks)]

    for nname, success, bdev_status, idx in diskdata:
      # the 'ghost node' construction in Exec() ensures that we have a
      # node here
      snode = node_image[nname]
      bad_snode = snode.ghost or snode.offline
      self._ErrorIf(instance.disks_active and
                    not success and not bad_snode,
                    constants.CV_EINSTANCEFAULTYDISK, instance.name,
                    "couldn't retrieve status for disk/%s on %s: %s",
                    idx, self.cfg.GetNodeName(nname), bdev_status)

      if instance.disks_active and success and bdev_status.is_degraded:
        msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))

        code = self.ETYPE_ERROR
        # a degraded disk that is healthy or syncing locally only warns
        accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]

        if bdev_status.ldisk_status in accepted_lds:
          code = self.ETYPE_WARNING

        msg += "; local disk state is '%s'" % \
                 constants.LDS_NAMES[bdev_status.ldisk_status]

        self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
                    code=code)

    self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
                  constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
                  "instance %s, connection to primary node failed",
                  instance.name)

    secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
    self._ErrorIf(len(secondary_nodes) > 1,
                  constants.CV_EINSTANCELAYOUT, instance.name,
                  "instance has multiple secondary nodes: %s",
                  utils.CommaJoin(secondary_nodes),
                  code=self.ETYPE_WARNING)

    inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
    disks = self.cfg.GetInstanceDisks(instance.uuid)
    if any(es_flags.values()):
      if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
        # Disk template not compatible with exclusive_storage: no instance
        # node should have the flag set
        es_nodes = [n
                    for (n, es) in es_flags.items()
                    if es]
        unsupported = [d.dev_type for d in disks
                       if d.dev_type not in constants.DTS_EXCL_STORAGE]
        self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
                    "instance uses disk types %s, which are not supported on"
                    " nodes that have exclusive storage set: %s",
                    utils.CommaJoin(unsupported),
                    utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
      for (idx, disk) in enumerate(disks):
        self._ErrorIf(disk.spindles is None,
                      constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
                      "number of spindles not configured for disk %s while"
                      " exclusive storage is enabled, try running"
                      " gnt-cluster repair-disk-sizes", idx)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      instance_nodes = utils.NiceSort(inst_nodes)
      instance_groups = {}

      for node_uuid in instance_nodes:
        instance_groups.setdefault(self.all_node_info[node_uuid].group,
                                   []).append(node_uuid)

      pretty_list = [
        "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
                           groupinfo[group].name)
        # Sort so that we always list the primary node first.
        for group, nodes in sorted(instance_groups.items(),
                                   key=lambda (_, nodes): pnode_uuid in nodes,
                                   reverse=True)]

      self._ErrorIf(len(instance_groups) > 1,
                    constants.CV_EINSTANCESPLITGROUPS,
                    instance.name, "instance has primary and secondary nodes in"
                    " different groups: %s", utils.CommaJoin(pretty_list),
                    code=self.ETYPE_WARNING)

    inst_nodes_offline = []
    for snode in secondary_nodes:
      s_img = node_image[snode]
      self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
                    self.cfg.GetNodeName(snode),
                    "instance %s, connection to secondary node failed",
                    instance.name)

      if s_img.offline:
        inst_nodes_offline.append(snode)

    # warn that the instance lives on offline nodes
    self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance has offline secondary node(s) %s",
                  utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
    # ... or ghost/non-vm_capable nodes
    for node_uuid in inst_nodes:
      self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
                    instance.name, "instance lives on ghost node %s",
                    self.cfg.GetNodeName(node_uuid))
      self._ErrorIf(not node_image[node_uuid].vm_capable,
                    constants.CV_EINSTANCEBADNODE, instance.name,
                    "instance lives on non-vm_capable node %s",
                    self.cfg.GetNodeName(node_uuid))
887
888 - def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image, 889 reserved):
890 """Verify if there are any unknown volumes in the cluster. 891 892 The .os, .swap and backup volumes are ignored. All other volumes are 893 reported as unknown. 894 895 @type vg_name: string 896 @param vg_name: the name of the Ganeti-administered volume group 897 @type reserved: L{ganeti.utils.FieldSet} 898 @param reserved: a FieldSet of reserved volume names 899 900 """ 901 for node_uuid, n_img in node_image.items(): 902 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or 903 self.all_node_info[node_uuid].group != self.group_uuid): 904 # skip non-healthy nodes 905 continue 906 for volume in n_img.volumes: 907 # skip volumes not belonging to the ganeti-administered volume group 908 if volume.split('/')[0] != vg_name: 909 continue 910 911 test = ((node_uuid not in node_vol_should or 912 volume not in node_vol_should[node_uuid]) and 913 not reserved.Matches(volume)) 914 self._ErrorIf(test, constants.CV_ENODEORPHANLV, 915 self.cfg.GetNodeName(node_uuid), 916 "volume %s is unknown", volume, 917 code=_VerifyErrors.ETYPE_WARNING)
918
  def _VerifyNPlusOneMemory(self, node_image, all_insts):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    @param node_image: the dict of node images, indexed by node UUID
    @param all_insts: the dict of all instance objects, indexed by
        instance UUID

    """
    cluster_info = self.cfg.GetClusterInfo()
    for node_uuid, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      if n_img.offline or \
         self.all_node_info[node_uuid].group != self.group_uuid:
        # we're skipping nodes marked offline and nodes in other groups from
        # the N+1 warning, since most likely we don't have good memory
        # information from them; we already list instances living on such
        # nodes, and that's enough warning
        continue
      #TODO(dynmem): also consider ballooning out other instances
      for prinode, inst_uuids in n_img.sbp.items():
        # sum the minimum memory of all auto-balanced instances that have
        # this node as secondary and 'prinode' as primary
        needed_mem = 0
        for inst_uuid in inst_uuids:
          bep = cluster_info.FillBE(all_insts[inst_uuid])
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MINMEM]
        test = n_img.mfree < needed_mem
        self._ErrorIf(test, constants.CV_ENODEN1,
                      self.cfg.GetNodeName(node_uuid),
                      "not enough memory to accomodate instance failovers"
                      " should node %s fail (%dMiB needed, %dMiB available)",
                      self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
956
957 - def _VerifyClientCertificates(self, nodes, all_nvinfo):
958 """Verifies the consistency of the client certificates. 959 960 This includes several aspects: 961 - the individual validation of all nodes' certificates 962 - the consistency of the master candidate certificate map 963 - the consistency of the master candidate certificate map with the 964 certificates that the master candidates are actually using. 965 966 @param nodes: the list of nodes to consider in this verification 967 @param all_nvinfo: the map of results of the verify_node call to 968 all nodes 969 970 """ 971 candidate_certs = self.cfg.GetClusterInfo().candidate_certs 972 if candidate_certs is None or len(candidate_certs) == 0: 973 self._ErrorIf( 974 True, constants.CV_ECLUSTERCLIENTCERT, None, 975 "The cluster's list of master candidate certificates is empty." 976 " If you just updated the cluster, please run" 977 " 'gnt-cluster renew-crypto --new-node-certificates'.") 978 return 979 980 self._ErrorIf( 981 len(candidate_certs) != len(set(candidate_certs.values())), 982 constants.CV_ECLUSTERCLIENTCERT, None, 983 "There are at least two master candidates configured to use the same" 984 " certificate.") 985 986 # collect the client certificate 987 for node in nodes: 988 if node.offline: 989 continue 990 991 nresult = all_nvinfo[node.uuid] 992 if nresult.fail_msg or not nresult.payload: 993 continue 994 995 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None) 996 997 self._ErrorIf( 998 errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None, 999 "Client certificate of node '%s' failed validation: %s (code '%s')", 1000 node.uuid, msg, errcode) 1001 1002 if not errcode: 1003 digest = msg 1004 if node.master_candidate: 1005 if node.uuid in candidate_certs: 1006 self._ErrorIf( 1007 digest != candidate_certs[node.uuid], 1008 constants.CV_ECLUSTERCLIENTCERT, None, 1009 "Client certificate digest of master candidate '%s' does not" 1010 " match its entry in the cluster's map of master candidate" 1011 " certificates. 
Expected: %s Got: %s", node.uuid, 1012 digest, candidate_certs[node.uuid]) 1013 else: 1014 self._ErrorIf( 1015 True, constants.CV_ECLUSTERCLIENTCERT, None, 1016 "The master candidate '%s' does not have an entry in the" 1017 " map of candidate certificates.", node.uuid) 1018 self._ErrorIf( 1019 digest in candidate_certs.values(), 1020 constants.CV_ECLUSTERCLIENTCERT, None, 1021 "Master candidate '%s' is using a certificate of another node.", 1022 node.uuid) 1023 else: 1024 self._ErrorIf( 1025 node.uuid in candidate_certs, 1026 constants.CV_ECLUSTERCLIENTCERT, None, 1027 "Node '%s' is not a master candidate, but still listed in the" 1028 " map of master candidate certificates.", node.uuid) 1029 self._ErrorIf( 1030 (node.uuid not in candidate_certs) and 1031 (digest in candidate_certs.values()), 1032 constants.CV_ECLUSTERCLIENTCERT, None, 1033 "Node '%s' is not a master candidate and is incorrectly using a" 1034 " certificate of another node which is master candidate.", 1035 node.uuid)
1036
1037 - def _VerifySshSetup(self, nodes, all_nvinfo):
1038 """Evaluates the verification results of the SSH setup and clutter test. 1039 1040 @param nodes: List of L{objects.Node} objects 1041 @param all_nvinfo: RPC results 1042 1043 """ 1044 for node in nodes: 1045 if not node.offline: 1046 nresult = all_nvinfo[node.uuid] 1047 if nresult.fail_msg or not nresult.payload: 1048 self._ErrorIf(True, constants.CV_ENODESSH, node.name, 1049 "Could not verify the SSH setup of this node.") 1050 return 1051 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]: 1052 result = nresult.payload.get(ssh_test, None) 1053 error_msg = "" 1054 if isinstance(result, list): 1055 error_msg = " ".join(result) 1056 self._ErrorIf(result, 1057 constants.CV_ENODESSH, None, error_msg)
1058
  def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                   (files_all, files_opt, files_mc, files_vm)):
    """Verifies file checksums collected from all nodes.

    @param nodes: List of L{objects.Node} objects
    @param master_node_uuid: UUID of master node
    @param all_nvinfo: RPC results
    @param (files_all, files_opt, files_mc, files_vm): sets of files that
        must exist everywhere, are optional, belong on master candidates,
        and belong on vm_capable nodes, respectively

    """
    # NOTE(review): the tuple-unpacking parameter in the signature is
    # Python 2-only syntax (removed by PEP 3113).
    # Define functions determining which nodes to consider for a file
    files2nodefn = [
      (files_all, None),
      (files_mc, lambda node: (node.master_candidate or
                               node.uuid == master_node_uuid)),
      (files_vm, lambda node: node.vm_capable),
      ]

    # Build mapping from filename to list of nodes which should have the file
    nodefiles = {}
    for (files, fn) in files2nodefn:
      if fn is None:
        filenodes = nodes
      else:
        filenodes = filter(fn, nodes)
      nodefiles.update((filename,
                        frozenset(map(operator.attrgetter("uuid"), filenodes)))
                       for filename in files)

    assert set(nodefiles) == (files_all | files_mc | files_vm)

    # fileinfo maps filename -> {checksum -> set of node UUIDs having it}
    fileinfo = dict((filename, {}) for filename in nodefiles)
    ignore_nodes = set()

    for node in nodes:
      if node.offline:
        ignore_nodes.add(node.uuid)
        continue

      nresult = all_nvinfo[node.uuid]

      if nresult.fail_msg or not nresult.payload:
        node_files = None
      else:
        fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
        # convert back from the virtual-cluster path namespace
        node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                          for (key, value) in fingerprints.items())
        del fingerprints

      test = not (node_files and isinstance(node_files, dict))
      self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                    "Node did not return file checksum data")
      if test:
        ignore_nodes.add(node.uuid)
        continue

      # Build per-checksum mapping from filename to nodes having it
      for (filename, checksum) in node_files.items():
        assert filename in nodefiles
        fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

    for (filename, checksums) in fileinfo.items():
      assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

      # Nodes having the file
      with_file = frozenset(node_uuid
                            for node_uuids in fileinfo[filename].values()
                            for node_uuid in node_uuids) - ignore_nodes

      expected_nodes = nodefiles[filename] - ignore_nodes

      # Nodes missing file
      missing_file = expected_nodes - with_file

      if filename in files_opt:
        # All or no nodes
        self._ErrorIf(missing_file and missing_file != expected_nodes,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is optional, but it must exist on all or no"
                      " nodes (not found on %s)",
                      filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))
      else:
        self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is missing from node(s) %s", filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))

        # Warn if a node has a file it shouldn't
        unexpected = with_file - expected_nodes
        self._ErrorIf(unexpected,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s should not exist on node(s) %s",
                      filename, utils.CommaJoin(
                        utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))

      # See if there are multiple versions of the file
      test = len(checksums) > 1
      if test:
        variants = ["variant %s on %s" %
                    (idx + 1,
                     utils.CommaJoin(utils.NiceSort(
                       map(self.cfg.GetNodeName, node_uuids))))
                    for (idx, (checksum, node_uuids)) in
                    enumerate(sorted(checksums.items()))]
      else:
        variants = []

      self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s found with %s different checksums (%s)",
                    filename, len(checksums), "; ".join(variants))
1172
1173 - def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1174 """Verify the drbd helper. 1175 1176 """ 1177 if drbd_helper: 1178 helper_result = nresult.get(constants.NV_DRBDHELPER, None) 1179 test = (helper_result is None) 1180 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1181 "no drbd usermode helper returned") 1182 if helper_result: 1183 status, payload = helper_result 1184 test = not status 1185 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1186 "drbd usermode helper check unsuccessful: %s", payload) 1187 test = status and (payload != drbd_helper) 1188 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name, 1189 "wrong drbd usermode helper: %s", payload)
1190 1191 @staticmethod
1192 - def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1193 """Gives the DRBD information in a map for a node. 1194 1195 @type ninfo: L{objects.Node} 1196 @param ninfo: the node to check 1197 @param instanceinfo: the dict of instances 1198 @param disks_info: the dict of disks 1199 @param drbd_map: the DRBD map as returned by 1200 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1201 @type error_if: callable like L{_ErrorIf} 1202 @param error_if: The error reporting function 1203 @return: dict from minor number to (disk_uuid, instance_uuid, active) 1204 1205 """ 1206 node_drbd = {} 1207 for minor, disk_uuid in drbd_map[ninfo.uuid].items(): 1208 test = disk_uuid not in disks_info 1209 error_if(test, constants.CV_ECLUSTERCFG, None, 1210 "ghost disk '%s' in temporary DRBD map", disk_uuid) 1211 # ghost disk should not be active, but otherwise we 1212 # don't give double warnings (both ghost disk and 1213 # unallocated minor in use) 1214 if test: 1215 node_drbd[minor] = (disk_uuid, None, False) 1216 else: 1217 disk_active = False 1218 disk_instance = None 1219 for (inst_uuid, inst) in instanceinfo.items(): 1220 if disk_uuid in inst.disks: 1221 disk_active = inst.disks_active 1222 disk_instance = inst_uuid 1223 break 1224 node_drbd[minor] = (disk_uuid, disk_instance, disk_active) 1225 return node_drbd
1226
1227 - def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info, 1228 drbd_helper, drbd_map):
1229 """Verifies and the node DRBD status. 1230 1231 @type ninfo: L{objects.Node} 1232 @param ninfo: the node to check 1233 @param nresult: the remote results for the node 1234 @param instanceinfo: the dict of instances 1235 @param disks_info: the dict of disks 1236 @param drbd_helper: the configured DRBD usermode helper 1237 @param drbd_map: the DRBD map as returned by 1238 L{ganeti.config.ConfigWriter.ComputeDRBDMap} 1239 1240 """ 1241 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper) 1242 1243 # compute the DRBD minors 1244 node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info, 1245 drbd_map, self._ErrorIf) 1246 1247 # and now check them 1248 used_minors = nresult.get(constants.NV_DRBDLIST, []) 1249 test = not isinstance(used_minors, (tuple, list)) 1250 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1251 "cannot parse drbd status file: %s", str(used_minors)) 1252 if test: 1253 # we cannot check drbd status 1254 return 1255 1256 for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items(): 1257 test = minor not in used_minors and must_exist 1258 if inst_uuid is not None: 1259 attached = "(attached in instance '%s')" % \ 1260 self.cfg.GetInstanceName(inst_uuid) 1261 else: 1262 attached = "(detached)" 1263 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1264 "drbd minor %d of disk %s %s is not active", 1265 minor, disk_uuid, attached) 1266 for minor in used_minors: 1267 test = minor not in node_drbd 1268 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name, 1269 "unallocated drbd minor %d is in use", minor)
1270
1271 - def _UpdateNodeOS(self, ninfo, nresult, nimg):
1272 """Builds the node OS structures. 1273 1274 @type ninfo: L{objects.Node} 1275 @param ninfo: the node to check 1276 @param nresult: the remote results for the node 1277 @param nimg: the node image object 1278 1279 """ 1280 remote_os = nresult.get(constants.NV_OSLIST, None) 1281 test = (not isinstance(remote_os, list) or 1282 not compat.all(isinstance(v, list) and len(v) == 8 1283 for v in remote_os)) 1284 1285 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name, 1286 "node hasn't returned valid OS data") 1287 1288 nimg.os_fail = test 1289 1290 if test: 1291 return 1292 1293 os_dict = {} 1294 1295 for (name, os_path, status, diagnose, 1296 variants, parameters, api_ver, 1297 trusted) in nresult[constants.NV_OSLIST]: 1298 1299 if name not in os_dict: 1300 os_dict[name] = [] 1301 1302 # parameters is a list of lists instead of list of tuples due to 1303 # JSON lacking a real tuple type, fix it: 1304 parameters = [tuple(v) for v in parameters] 1305 os_dict[name].append((os_path, status, diagnose, 1306 set(variants), set(parameters), set(api_ver), 1307 trusted)) 1308 1309 nimg.oslist = os_dict
1310
  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    Each OS found on this node is checked for internal validity and then
    compared against the same OS on the reference ('base') node image.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      # only the first entry counts; further ones are shadowed duplicates
      f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
      self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                    "Invalid OS %s (located at %s): %s",
                    os_name, f_path, f_diag)
      self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                    "OS '%s' has multiple entries"
                    " (first one shadows the rest): %s",
                    os_name, utils.CommaJoin([v[0] for v in os_data]))
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
                    "Extra OS %s not present on reference node (%s)",
                    os_name, self.cfg.GetNodeName(base.uuid))
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      # set-valued fields are rendered sorted for a stable error message
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", beautify_params(f_param),
                          beautify_params(b_param))]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " [%s] vs. [%s]", kind, os_name,
                      self.cfg.GetNodeName(base.uuid),
                      utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
      # scalar field, rendered as-is
      for kind, a, b in [("trusted", f_trusted, b_trusted)]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " %s vs. %s", kind, os_name,
                      self.cfg.GetNodeName(base.uuid), a, b)

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                  "OSes present on reference node %s"
                  " but missing on this node: %s",
                  self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1366
1367 - def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1368 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}. 1369 1370 @type ninfo: L{objects.Node} 1371 @param ninfo: the node to check 1372 @param nresult: the remote results for the node 1373 @type is_master: bool 1374 @param is_master: Whether node is the master node 1375 1376 """ 1377 cluster = self.cfg.GetClusterInfo() 1378 if (is_master and 1379 (cluster.IsFileStorageEnabled() or 1380 cluster.IsSharedFileStorageEnabled())): 1381 try: 1382 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS] 1383 except KeyError: 1384 # This should never happen 1385 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1386 "Node did not return forbidden file storage paths") 1387 else: 1388 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1389 "Found forbidden file storage paths: %s", 1390 utils.CommaJoin(fspaths)) 1391 else: 1392 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult, 1393 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name, 1394 "Node should not have returned forbidden file storage" 1395 " paths")
1396
1397 - def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template, 1398 verify_key, error_key):
1399 """Verifies (file) storage paths. 1400 1401 @type ninfo: L{objects.Node} 1402 @param ninfo: the node to check 1403 @param nresult: the remote results for the node 1404 @type file_disk_template: string 1405 @param file_disk_template: file-based disk template, whose directory 1406 is supposed to be verified 1407 @type verify_key: string 1408 @param verify_key: key for the verification map of this file 1409 verification step 1410 @param error_key: error key to be added to the verification results 1411 in case something goes wrong in this verification step 1412 1413 """ 1414 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes( 1415 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER 1416 )) 1417 1418 cluster = self.cfg.GetClusterInfo() 1419 if cluster.IsDiskTemplateEnabled(file_disk_template): 1420 self._ErrorIf( 1421 verify_key in nresult, 1422 error_key, ninfo.name, 1423 "The configured %s storage path is unusable: %s" % 1424 (file_disk_template, nresult.get(verify_key)))
1425
1426 - def _VerifyFileStoragePaths(self, ninfo, nresult):
1427 """Verifies (file) storage paths. 1428 1429 @see: C{_VerifyStoragePaths} 1430 1431 """ 1432 self._VerifyStoragePaths( 1433 ninfo, nresult, constants.DT_FILE, 1434 constants.NV_FILE_STORAGE_PATH, 1435 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1436
1437 - def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1438 """Verifies (file) storage paths. 1439 1440 @see: C{_VerifyStoragePaths} 1441 1442 """ 1443 self._VerifyStoragePaths( 1444 ninfo, nresult, constants.DT_SHARED_FILE, 1445 constants.NV_SHARED_FILE_STORAGE_PATH, 1446 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1447
1448 - def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1449 """Verifies (file) storage paths. 1450 1451 @see: C{_VerifyStoragePaths} 1452 1453 """ 1454 self._VerifyStoragePaths( 1455 ninfo, nresult, constants.DT_GLUSTER, 1456 constants.NV_GLUSTER_STORAGE_PATH, 1457 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1458
1459 - def _VerifyOob(self, ninfo, nresult):
1460 """Verifies out of band functionality of a node. 1461 1462 @type ninfo: L{objects.Node} 1463 @param ninfo: the node to check 1464 @param nresult: the remote results for the node 1465 1466 """ 1467 # We just have to verify the paths on master and/or master candidates 1468 # as the oob helper is invoked on the master 1469 if ((ninfo.master_candidate or ninfo.master_capable) and 1470 constants.NV_OOB_PATHS in nresult): 1471 for path_result in nresult[constants.NV_OOB_PATHS]: 1472 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, 1473 ninfo.name, path_result)
1474
1475 - def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1476 """Verifies and updates the node volume data. 1477 1478 This function will update a L{NodeImage}'s internal structures 1479 with data from the remote call. 1480 1481 @type ninfo: L{objects.Node} 1482 @param ninfo: the node to check 1483 @param nresult: the remote results for the node 1484 @param nimg: the node image object 1485 @param vg_name: the configured VG name 1486 1487 """ 1488 nimg.lvm_fail = True 1489 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data") 1490 if vg_name is None: 1491 pass 1492 elif isinstance(lvdata, basestring): 1493 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1494 "LVM problem on node: %s", utils.SafeEncode(lvdata)) 1495 elif not isinstance(lvdata, dict): 1496 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name, 1497 "rpc call to node failed (lvlist)") 1498 else: 1499 nimg.volumes = lvdata 1500 nimg.lvm_fail = False
1501
1502 - def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1503 """Verifies and updates the node instance list. 1504 1505 If the listing was successful, then updates this node's instance 1506 list. Otherwise, it marks the RPC call as failed for the instance 1507 list key. 1508 1509 @type ninfo: L{objects.Node} 1510 @param ninfo: the node to check 1511 @param nresult: the remote results for the node 1512 @param nimg: the node image object 1513 1514 """ 1515 idata = nresult.get(constants.NV_INSTANCELIST, None) 1516 test = not isinstance(idata, list) 1517 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1518 "rpc call to node failed (instancelist): %s", 1519 utils.SafeEncode(str(idata))) 1520 if test: 1521 nimg.hyp_fail = True 1522 else: 1523 nimg.instances = [uuid for (uuid, _) in 1524 self.cfg.GetMultiInstanceInfoByName(idata)]
1525
1526 - def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1527 """Verifies and computes a node information map 1528 1529 @type ninfo: L{objects.Node} 1530 @param ninfo: the node to check 1531 @param nresult: the remote results for the node 1532 @param nimg: the node image object 1533 @param vg_name: the configured VG name 1534 1535 """ 1536 # try to read free memory (from the hypervisor) 1537 hv_info = nresult.get(constants.NV_HVINFO, None) 1538 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info 1539 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, 1540 "rpc call to node failed (hvinfo)") 1541 if not test: 1542 try: 1543 nimg.mfree = int(hv_info["memory_free"]) 1544 except (ValueError, TypeError): 1545 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1546 "node returned invalid nodeinfo, check hypervisor") 1547 1548 # FIXME: devise a free space model for file based instances as well 1549 if vg_name is not None: 1550 test = (constants.NV_VGLIST not in nresult or 1551 vg_name not in nresult[constants.NV_VGLIST]) 1552 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name, 1553 "node didn't return data for the volume group '%s'" 1554 " - it is either missing or broken", vg_name) 1555 if not test: 1556 try: 1557 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name]) 1558 except (ValueError, TypeError): 1559 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name, 1560 "node returned invalid LVM info, check LVM status")
1561
  def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
    """Gets per-disk status information for all instances.

    @type node_uuids: list of strings
    @param node_uuids: Node UUIDs
    @type node_image: dict of (UUID, L{objects.Node})
    @param node_image: Node objects
    @type instanceinfo: dict of (UUID, L{objects.Instance})
    @param instanceinfo: Instance objects
    @rtype: {instance: {node: [(succes, payload)]}}
    @return: a dictionary of per-instance dictionaries with nodes as
        keys and disk information as values; the disk information is a
        list of tuples (success, payload)

    """
    node_disks = {}
    node_disks_dev_inst_only = {}
    diskless_instances = set()
    nodisk_instances = set()

    # first pass: build the (instance, disk) list per node and remember
    # instances without disks so they still get an (empty) entry later
    for nuuid in node_uuids:
      node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                             node_image[nuuid].sinst))
      diskless_instances.update(uuid for uuid in node_inst_uuids
                                if not instanceinfo[uuid].disks)
      disks = [(inst_uuid, disk)
               for inst_uuid in node_inst_uuids
               for disk in self.cfg.GetInstanceDisks(inst_uuid)]

      if not disks:
        nodisk_instances.update(uuid for uuid in node_inst_uuids
                                if instanceinfo[uuid].disks)
        # No need to collect data
        continue

      node_disks[nuuid] = disks

      # _AnnotateDiskParams makes already copies of the disks
      dev_inst_only = []
      for (inst_uuid, dev) in disks:
        (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                          self.cfg)
        dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

      node_disks_dev_inst_only[nuuid] = dev_inst_only

    assert len(node_disks) == len(node_disks_dev_inst_only)

    # Collect data from all nodes with disks
    result = self.rpc.call_blockdev_getmirrorstatus_multi(
               node_disks.keys(), node_disks_dev_inst_only)

    assert len(result) == len(node_disks)

    instdisk = {}

    # second pass: fold the RPC answers back into a per-instance,
    # per-node list of (success, payload) tuples
    for (nuuid, nres) in result.items():
      node = self.cfg.GetNodeInfo(nuuid)
      disks = node_disks[node.uuid]

      if nres.offline:
        # No data from this node
        data = len(disks) * [(False, "node offline")]
      else:
        msg = nres.fail_msg
        self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                      "while getting disk information: %s", msg)
        if msg:
          # No data from this node
          data = len(disks) * [(False, msg)]
        else:
          data = []
          for idx, i in enumerate(nres.payload):
            if isinstance(i, (tuple, list)) and len(i) == 2:
              data.append(i)
            else:
              logging.warning("Invalid result from node %s, entry %d: %s",
                              node.name, idx, i)
              data.append((False, "Invalid result from the remote node"))

      # 'disks' and 'data' are parallel lists by construction
      for ((inst_uuid, _), status) in zip(disks, data):
        instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
          .append(status)

    # Add empty entries for diskless instances.
    for inst_uuid in diskless_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}
    # ...and disk-full instances that happen to have no disks
    for inst_uuid in nodisk_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}

    assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                      len(nuuids) <= len(
                        self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                      compat.all(isinstance(s, (tuple, list)) and
                                 len(s) == 2 for s in statuses)
                      for inst, nuuids in instdisk.items()
                      for nuuid, statuses in nuuids.items())
    if __debug__:
      instdisk_keys = set(instdisk)
      instanceinfo_keys = set(instanceinfo)
      assert instdisk_keys == instanceinfo_keys, \
        ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
         (instdisk_keys, instanceinfo_keys))

    return instdisk
1670 1671 @staticmethod
1672 - def _SshNodeSelector(group_uuid, all_nodes):
1673 """Create endless iterators for all potential SSH check hosts. 1674 1675 """ 1676 nodes = [node for node in all_nodes 1677 if (node.group != group_uuid and 1678 not node.offline)] 1679 keyfunc = operator.attrgetter("group") 1680 1681 return map(itertools.cycle, 1682 [sorted(map(operator.attrgetter("name"), names)) 1683 for _, names in itertools.groupby(sorted(nodes, key=keyfunc), 1684 keyfunc)])
1685 1686 @classmethod
1687 - def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
1688 """Choose which nodes should talk to which other nodes. 1689 1690 We will make nodes contact all nodes in their group, and one node from 1691 every other group. 1692 1693 @rtype: tuple of (string, dict of strings to list of strings, string) 1694 @return: a tuple containing the list of all online nodes, a dictionary 1695 mapping node names to additional nodes of other node groups to which 1696 connectivity should be tested, and a list of all online master 1697 candidates 1698 1699 @warning: This algorithm has a known issue if one node group is much 1700 smaller than others (e.g. just one node). In such a case all other 1701 nodes will talk to the single node. 1702 1703 """ 1704 online_nodes = sorted(node.name for node in group_nodes if not node.offline) 1705 online_mcs = sorted(node.name for node in group_nodes 1706 if (node.master_candidate and not node.offline)) 1707 sel = cls._SshNodeSelector(group_uuid, all_nodes) 1708 1709 return (online_nodes, 1710 dict((name, sorted([i.next() for i in sel])) 1711 for name in online_nodes), 1712 online_mcs)
1713
1714 - def _PrepareSshSetupCheck(self):
1715 """Prepare the input data for the SSH setup verification. 1716 1717 """ 1718 all_nodes_info = self.cfg.GetAllNodesInfo() 1719 potential_master_candidates = self.cfg.GetPotentialMasterCandidates() 1720 node_status = [ 1721 (uuid, node_info.name, node_info.master_candidate, 1722 node_info.name in potential_master_candidates, not node_info.offline) 1723 for (uuid, node_info) in all_nodes_info.items()] 1724 return node_status
1725
1726 - def BuildHooksEnv(self):
1727 """Build hooks env. 1728 1729 Cluster-Verify hooks just ran in the post phase and their failure makes 1730 the output be logged in the verify output and the verification to fail. 1731 1732 """ 1733 env = { 1734 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()), 1735 } 1736 1737 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags())) 1738 for node in self.my_node_info.values()) 1739 1740 return env
1741
1742 - def BuildHooksNodes(self):
1743 """Build hooks nodes. 1744 1745 """ 1746 return ([], list(self.my_node_info.keys()))
1747 1748 @staticmethod
1749 - def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced, 1750 i_offline, n_offline, n_drained):
1751 feedback_fn("* Other Notes") 1752 if i_non_redundant: 1753 feedback_fn(" - NOTICE: %d non-redundant instance(s) found." 1754 % len(i_non_redundant)) 1755 1756 if i_non_a_balanced: 1757 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found." 1758 % len(i_non_a_balanced)) 1759 1760 if i_offline: 1761 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline) 1762 1763 if n_offline: 1764 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline) 1765 1766 if n_drained: 1767 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1768
  def Exec(self, feedback_fn): # pylint: disable=R0915
    """Verify integrity of the node group, performing various test on nodes.

    @param feedback_fn: callable used to report progress and notices back
        to the caller
    @rtype: boolean
    @return: True if no errors were found (i.e. C{self.bad} stayed False),
        False otherwise

    """
    # This method has too many local variables. pylint: disable=R0914
    feedback_fn("* Verifying group '%s'" % self.group_info.name)

    if not self.my_node_uuids:
      # empty node group
      feedback_fn("* Empty node group, skipping verification")
      return True

    self.bad = False
    verbose = self.op.verbose
    self._feedback_fn = feedback_fn

    vg_name = self.cfg.GetVGName()
    drbd_helper = self.cfg.GetDRBDHelper()
    cluster = self.cfg.GetClusterInfo()
    hypervisors = cluster.enabled_hypervisors
    node_data_list = self.my_node_info.values()

    i_non_redundant = [] # Non redundant instances
    i_non_a_balanced = [] # Non auto-balanced instances
    i_offline = 0 # Count of offline instances
    n_offline = 0 # Count of offline nodes
    n_drained = 0 # Count of nodes being drained
    node_vol_should = {}

    # FIXME: verify OS list

    # File verification
    filemap = ComputeAncillaryFiles(cluster, False)

    # do local checksums
    master_node_uuid = self.master_node = self.cfg.GetMasterNode()
    master_ip = self.cfg.GetMasterIP()

    feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))

    user_scripts = []
    if self.cfg.GetUseExternalMipScript():
      user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # Parameters describing which checks each node should run; extended
    # conditionally below (SSH setup, LVM, DRBD, file storage, bridges, OOB)
    node_verify_param = {
      constants.NV_FILELIST:
        map(vcluster.MakeVirtualPath,
            utils.UniqueSequence(filename
                                 for files in filemap
                                 for filename in files)),
      constants.NV_NODELIST:
        self._SelectSshCheckNodes(node_data_list, self.group_uuid,
                                  self.all_node_info.values()),
      constants.NV_HYPERVISOR: hypervisors,
      constants.NV_HVPARAMS:
        _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
      constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
                                 for node in node_data_list
                                 if not node.offline],
      constants.NV_INSTANCELIST: hypervisors,
      constants.NV_VERSION: None,
      constants.NV_HVINFO: self.cfg.GetHypervisorType(),
      constants.NV_NODESETUP: None,
      constants.NV_TIME: None,
      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
      constants.NV_OSLIST: None,
      constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
      constants.NV_USERSCRIPTS: user_scripts,
      constants.NV_CLIENT_CERT: None,
      }

    if self.cfg.GetClusterInfo().modify_ssh_setup:
      node_verify_param[constants.NV_SSH_SETUP] = self._PrepareSshSetupCheck()
      if self.op.verify_clutter:
        node_verify_param[constants.NV_SSH_CLUTTER] = True

    if vg_name is not None:
      node_verify_param[constants.NV_VGLIST] = None
      node_verify_param[constants.NV_LVLIST] = vg_name
      node_verify_param[constants.NV_PVLIST] = [vg_name]

    if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
      if drbd_helper:
        node_verify_param[constants.NV_DRBDVERSION] = None
        node_verify_param[constants.NV_DRBDLIST] = None
        node_verify_param[constants.NV_DRBDHELPER] = drbd_helper

    if cluster.IsFileStorageEnabled() or \
        cluster.IsSharedFileStorageEnabled():
      # Load file storage paths only from master node
      node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
        self.cfg.GetMasterNodeName()
      if cluster.IsFileStorageEnabled():
        node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
          cluster.file_storage_dir
      if cluster.IsSharedFileStorageEnabled():
        node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
          cluster.shared_file_storage_dir

    # bridge checks
    # FIXME: this needs to be changed per node-group, not cluster-wide
    bridges = set()
    default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
    if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.add(default_nicpp[constants.NIC_LINK])
    # NOTE: despite its name, "inst_uuid" here iterates the instance
    # *objects* (dict values), not their UUIDs
    for inst_uuid in self.my_inst_info.values():
      for nic in inst_uuid.nics:
        full_nic = cluster.SimpleFillNIC(nic.nicparams)
        if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          bridges.add(full_nic[constants.NIC_LINK])

    if bridges:
      node_verify_param[constants.NV_BRIDGES] = list(bridges)

    # Build our expected cluster state
    node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
                                                 uuid=node.uuid,
                                                 vm_capable=node.vm_capable))
                      for node in node_data_list)

    # Gather OOB paths
    oob_paths = []
    for node in self.all_node_info.values():
      path = SupportsOob(self.cfg, node)
      if path and path not in oob_paths:
        oob_paths.append(path)

    if oob_paths:
      node_verify_param[constants.NV_OOB_PATHS] = oob_paths

    # Fill node_image with per-node primary/secondary instance lists and
    # register "ghost" entries for instance nodes outside this group
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if instance.admin_state == constants.ADMINST_OFFLINE:
        i_offline += 1

      inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
      for nuuid in inst_nodes:
        if nuuid not in node_image:
          gnode = self.NodeImage(uuid=nuuid)
          gnode.ghost = (nuuid not in self.all_node_info)
          node_image[nuuid] = gnode

      self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

      pnode = instance.primary_node
      node_image[pnode].pinst.append(instance.uuid)

      for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        nimg = node_image[snode]
        nimg.sinst.append(instance.uuid)
        if pnode not in nimg.sbp:
          nimg.sbp[pnode] = []
        nimg.sbp[pnode].append(instance.uuid)

    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
                                               self.my_node_info.keys())
    # The value of exclusive_storage should be the same across the group, so if
    # it's True for at least a node, we act as if it were set for all the nodes
    self._exclusive_storage = compat.any(es_flags.values())
    if self._exclusive_storage:
      node_verify_param[constants.NV_EXCLUSIVEPVS] = True

    node_group_uuids = dict(map(lambda n: (n.name, n.group),
                                self.cfg.GetAllNodesInfo().values()))
    groups_config = self.cfg.GetAllNodeGroupsInfoDict()

    # At this point, we have the in-memory data structures complete,
    # except for the runtime information, which we'll gather next

    # NOTE: Here we lock the configuration for the duration of RPC calls,
    # which means that the cluster configuration changes are blocked during
    # this period.
    # This is something that should be done only exceptionally and only for
    # justified cases!
    # In this case, we need the lock as we can only verify the integrity of
    # configuration files on MCs only if we know nobody else is modifying it.
    # FIXME: The check for integrity of config.data should be moved to
    # WConfD, which is the only one who can otherwise ensure nobody
    # will modify the configuration during the check.
    with self.cfg.GetConfigManager(shared=True, forcelock=True):
      feedback_fn("* Gathering information about nodes (%s nodes)" %
                  len(self.my_node_uuids))
      # Force the configuration to be fully distributed before doing any tests
      self.cfg.FlushConfig()
      # Due to the way our RPC system works, exact response times cannot be
      # guaranteed (e.g. a broken node could run into a timeout). By keeping
      # the time before and after executing the request, we can at least have
      # a time window.
      nvinfo_starttime = time.time()
      # Get lock on the configuration so that nobody modifies it concurrently.
      # Otherwise it can be modified by other jobs, failing the consistency
      # test.
      # NOTE: This is an exceptional situation, we should otherwise avoid
      # locking the configuration for something but very fast, pure operations.
      cluster_name = self.cfg.GetClusterName()
      hvparams = self.cfg.GetClusterInfo().hvparams
      all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                             node_verify_param,
                                             cluster_name,
                                             hvparams,
                                             node_group_uuids,
                                             groups_config)
      nvinfo_endtime = time.time()

      if self.extra_lv_nodes and vg_name is not None:
        feedback_fn("* Gathering information about extra nodes (%s nodes)" %
                    len(self.extra_lv_nodes))
        extra_lv_nvinfo = \
            self.rpc.call_node_verify(self.extra_lv_nodes,
                                      {constants.NV_LVLIST: vg_name},
                                      self.cfg.GetClusterName(),
                                      self.cfg.GetClusterInfo().hvparams,
                                      node_group_uuids,
                                      groups_config)
      else:
        extra_lv_nvinfo = {}

      # If not all nodes are being checked, we need to make sure the master
      # node and a non-checked vm_capable node are in the list.
      absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
      if absent_node_uuids:
        vf_nvinfo = all_nvinfo.copy()
        vf_node_info = list(self.my_node_info.values())
        additional_node_uuids = []
        if master_node_uuid not in self.my_node_info:
          additional_node_uuids.append(master_node_uuid)
          vf_node_info.append(self.all_node_info[master_node_uuid])
        # Add the first vm_capable node we find which is not included,
        # excluding the master node (which we already have)
        for node_uuid in absent_node_uuids:
          nodeinfo = self.all_node_info[node_uuid]
          if (nodeinfo.vm_capable and not nodeinfo.offline and
              node_uuid != master_node_uuid):
            additional_node_uuids.append(node_uuid)
            vf_node_info.append(self.all_node_info[node_uuid])
            break
        key = constants.NV_FILELIST

        feedback_fn("* Gathering information about the master node")
        vf_nvinfo.update(self.rpc.call_node_verify(
            additional_node_uuids, {key: node_verify_param[key]},
            self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams,
            node_group_uuids,
            groups_config))
      else:
        vf_nvinfo = all_nvinfo
        vf_node_info = self.my_node_info.values()

      all_drbd_map = self.cfg.ComputeDRBDMap()

      feedback_fn("* Gathering disk information (%s nodes)" %
                  len(self.my_node_uuids))
      instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
                                       self.my_inst_info)

      feedback_fn("* Verifying configuration file consistency")

      self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
      if self.cfg.GetClusterInfo().modify_ssh_setup:
        self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
      self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)

    feedback_fn("* Verifying node status")

    refos_img = None

    for node_i in node_data_list:
      nimg = node_image[node_i.uuid]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node_i.name,))
        n_offline += 1
        continue

      if node_i.uuid == master_node_uuid:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        n_drained += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))

      msg = all_nvinfo[node_i.uuid].fail_msg
      self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
                    "while contacting node: %s", msg)
      if msg:
        nimg.rpc_fail = True
        continue

      nresult = all_nvinfo[node_i.uuid].payload

      nimg.call_ok = self._VerifyNode(node_i, nresult)
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
      self._VerifyNodeNetwork(node_i, nresult)
      self._VerifyNodeUserScripts(node_i, nresult)
      self._VerifyOob(node_i, nresult)
      self._VerifyAcceptedFileStoragePaths(node_i, nresult,
                                           node_i.uuid == master_node_uuid)
      self._VerifyFileStoragePaths(node_i, nresult)
      self._VerifySharedFileStoragePaths(node_i, nresult)
      self._VerifyGlusterStoragePaths(node_i, nresult)

      if nimg.vm_capable:
        self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
        if constants.DT_DRBD8 in cluster.enabled_disk_templates:
          self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
                               self.all_disks_info, drbd_helper, all_drbd_map)

        if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
            (constants.DT_DRBD8 in cluster.enabled_disk_templates):
          self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
        self._UpdateNodeInstances(node_i, nresult, nimg)
        self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
        self._UpdateNodeOS(node_i, nresult, nimg)

        if not nimg.os_fail:
          if refos_img is None:
            refos_img = nimg
          self._VerifyNodeOS(node_i, nimg, refos_img)
        self._VerifyNodeBridges(node_i, nresult, bridges)

        # Check whether all running instances are primary for the node. (This
        # can no longer be done from _VerifyInstance below, since some of the
        # wrong instances could be from other node groups.)
        non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)

        for inst_uuid in non_primary_inst_uuids:
          test = inst_uuid in self.all_inst_info
          self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
                        self.cfg.GetInstanceName(inst_uuid),
                        "instance should not run on node %s", node_i.name)
          self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE,
                        node_i.name,
                        "node is running unknown instance %s", inst_uuid)

    self._VerifyGroupDRBDVersion(all_nvinfo)
    self._VerifyGroupLVM(node_image, vg_name)

    for node_uuid, result in extra_lv_nvinfo.items():
      self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
                              node_image[node_uuid], vg_name)

    feedback_fn("* Verifying instance status")
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if verbose:
        feedback_fn("* Verifying instance %s" % instance.name)
      self._VerifyInstance(instance, node_image, instdisk[inst_uuid])

      # If the instance is not fully redundant we cannot survive losing its
      # primary node, so we are not N+1 compliant.
      inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
      if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
        i_non_redundant.append(instance)

      if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
        i_non_a_balanced.append(instance)

    feedback_fn("* Verifying orphan volumes")
    reserved = utils.FieldSet(*cluster.reserved_lvs)

    # We will get spurious "unknown volume" warnings if any node of this group
    # is secondary for an instance whose primary is in another group. To avoid
    # them, we find these instances and add their volumes to node_vol_should.
    for instance in self.all_inst_info.values():
      for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        if (secondary in self.my_node_info
            and instance.name not in self.my_inst_info):
          self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
          break

    self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)

    if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
      feedback_fn("* Verifying N+1 Memory redundancy")
      self._VerifyNPlusOneMemory(node_image, self.my_inst_info)

    self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                           i_offline, n_offline, n_drained)

    return not self.bad
2155 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2156 """Analyze the post-hooks' result 2157 2158 This method analyses the hook result, handles it, and sends some 2159 nicely-formatted feedback back to the user. 2160 2161 @param phase: one of L{constants.HOOKS_PHASE_POST} or 2162 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase 2163 @param hooks_results: the results of the multi-node hooks rpc call 2164 @param feedback_fn: function used send feedback back to the caller 2165 @param lu_result: previous Exec result 2166 @return: the new Exec result, based on the previous result 2167 and hook results 2168 2169 """ 2170 # We only really run POST phase hooks, only for non-empty groups, 2171 # and are only interested in their results 2172 if not self.my_node_uuids: 2173 # empty node group 2174 pass 2175 elif phase == constants.HOOKS_PHASE_POST: 2176 # Used to change hooks' output to proper indentation 2177 feedback_fn("* Hooks Results") 2178 assert hooks_results, "invalid result from hooks" 2179 2180 for node_name in hooks_results: 2181 res = hooks_results[node_name] 2182 msg = res.fail_msg 2183 test = msg and not res.offline 2184 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2185 "Communication failure in hooks execution: %s", msg) 2186 if test: 2187 lu_result = False 2188 continue 2189 if res.offline: 2190 # No need to investigate payload if node is offline 2191 continue 2192 for script, hkr, output in res.payload: 2193 test = hkr == constants.HKR_FAIL 2194 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, 2195 "Script %s failed, output:", script) 2196 if test: 2197 output = self._HOOKS_INDENT_RE.sub(" ", output) 2198 feedback_fn("%s" % output) 2199 lu_result = False 2200 2201 return lu_result
2202