31 """Logical units dealing with the cluster."""
32
33 import copy
34 import itertools
35 import logging
36 import operator
37 import os
38 import re
39 import time
40
41 from ganeti import compat
42 from ganeti import constants
43 from ganeti import errors
44 from ganeti import hypervisor
45 from ganeti import locking
46 from ganeti import masterd
47 from ganeti import netutils
48 from ganeti import objects
49 from ganeti import opcodes
50 from ganeti import pathutils
51 from ganeti import query
52 import ganeti.rpc.node as rpc
53 from ganeti import runtime
54 from ganeti import ssh
55 from ganeti import uidpool
56 from ganeti import utils
57 from ganeti import vcluster
58
59 from ganeti.cmdlib.base import NoHooksLU, QueryBase, LogicalUnit, \
60 ResultWithJobs
61 from ganeti.cmdlib.common import ShareAll, RunPostHook, \
62 ComputeAncillaryFiles, RedistributeAncillaryFiles, UploadHelper, \
63 GetWantedInstances, MergeAndVerifyHvState, MergeAndVerifyDiskState, \
64 GetUpdatedIPolicy, ComputeNewInstanceViolations, GetUpdatedParams, \
65 CheckOSParams, CheckHVParams, AdjustCandidatePool, CheckNodePVs, \
66 ComputeIPolicyInstanceViolation, AnnotateDiskParams, SupportsOob, \
67 CheckIpolicyVsDiskTemplates, CheckDiskAccessModeValidity, \
68 CheckDiskAccessModeConsistency, CreateNewClientCert, EnsureKvmdOnNodes
69
70 import ganeti.masterd.instance
77 """Renews the master's client certificate and propagates the config.
78
79 @type lu: C{LogicalUnit}
80 @param lu: the logical unit holding the config
81 @type master_uuid: string
82 @param master_uuid: the master node's UUID
83 @type cluster: C{objects.Cluster}
84 @param cluster: the cluster's configuration
85 @type feedback_fn: function
86 @param feedback_fn: feedback functions for config updates
87 @type client_cert: string
88 @param client_cert: the path of the client certificate
89 @type client_cert_tmp: string
90 @param client_cert_tmp: the temporary path of the client certificate
91 @rtype: string
92 @return: the digest of the newly created client certificate
93
94 """
95 client_digest = CreateNewClientCert(lu, master_uuid, filename=client_cert_tmp)
96 utils.AddNodeToCandidateCerts(master_uuid, client_digest,
97 cluster.candidate_certs)
98
99
100 lu.cfg.Update(cluster, feedback_fn)
101
102 utils.RemoveFile(client_cert)
103 utils.RenameFile(client_cert_tmp, client_cert)
104 return client_digest
105
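# Usage sketch (illustrative; the real calls appear in LUClusterRenewCrypto
# and LUClusterPostInit below, and 'lu' stands for the logical unit holding
# the config):
#   digest = _UpdateMasterClientCert(
#       lu, master_uuid, cluster, feedback_fn,
#       client_cert=pathutils.NODED_CLIENT_CERT_FILE,
#       client_cert_tmp=pathutils.NODED_CLIENT_CERT_FILE_TMP)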
108 """Renew the cluster's crypto tokens.
109
110 Note that most of this operation is done in gnt_cluster.py; this LU only
111 takes care of renewing the client SSL certificates.
112
113 """
114 _MAX_NUM_RETRIES = 3
115
116 def Exec(self, feedback_fn):
117 master_uuid = self.cfg.GetMasterNode()
118 logging.debug("Renewing the master's SSL node certificate."
119 " Master's UUID: %s.", master_uuid)
120 cluster = self.cfg.GetClusterInfo()
121
122 server_digest = utils.GetCertificateDigest(
123 cert_filename=pathutils.NODED_CERT_FILE)
124 logging.debug("SSL digest of the node certificate: %s.", server_digest)
125 utils.AddNodeToCandidateCerts("%s-SERVER" % master_uuid,
126 server_digest,
127 cluster.candidate_certs)
128 logging.debug("Added master's digest as *-SERVER entry to configuration."
129 " Current list of candidate certificates: %s.",
130 str(cluster.candidate_certs))
131
132 try:
133 old_master_digest = utils.GetCertificateDigest(
134 cert_filename=pathutils.NODED_CLIENT_CERT_FILE)
135 logging.debug("SSL digest of old master's SSL node certificate: %s.",
136 old_master_digest)
137 utils.AddNodeToCandidateCerts("%s-OLDMASTER" % master_uuid,
138 old_master_digest,
139 cluster.candidate_certs)
140 logging.debug("Added old master's node certificate digest to config"
141 " as *-OLDMASTER. Current list of candidate certificates:"
142 " %s.", str(cluster.candidate_certs))
143
144 except IOError:
145 logging.info("No old master certificate available.")
146
147 last_exception = None
148 for i in range(self._MAX_NUM_RETRIES):
149 try:
150
151
152
153 _UpdateMasterClientCert(
154 self, master_uuid, cluster, feedback_fn,
155 client_cert=pathutils.NODED_CLIENT_CERT_FILE,
156 client_cert_tmp=pathutils.NODED_CLIENT_CERT_FILE_TMP)
157 logging.debug("Successfully renewed the master's node certificate.")
158 break
159 except errors.OpExecError as e:
160 logging.error("Renewing the master's SSL node certificate failed"
161 " at attempt no. %s with error '%s'", str(i), e)
162 last_exception = e
163 else:
164 if last_exception:
165 feedback_fn("Could not renew the master's client SSL certificate."
166 " Cleaning up. Error: %s." % last_exception)
167
168 utils.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid,
169 cluster.candidate_certs)
170 utils.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid,
171 cluster.candidate_certs)
172 logging.debug("Cleaned up *-SERVER and *-OLDMASTER certificate from"
173 " master candidate cert list. Current state of the"
174 " list: %s.", str(cluster.candidate_certs))
175 try:
176 utils.RemoveFile(pathutils.NODED_CLIENT_CERT_FILE_TMP)
177 except IOError as e:
178 logging.debug("Could not clean up temporary node certificate of the"
179 " master node. (Possibly because it was already removed"
180 " properly.) Error: %s.", e)
181 return
182
183 node_errors = {}
184 nodes = self.cfg.GetAllNodesInfo()
185 logging.debug("Renewing non-master nodes' node certificates.")
186 for (node_uuid, node_info) in nodes.items():
187 if node_info.offline:
188 feedback_fn("* Skipping offline node %s" % node_info.name)
189 logging.debug("Skipping offline node %s (UUID: %s).",
190 node_info.name, node_uuid)
191 continue
192 if node_uuid != master_uuid:
193 logging.debug("Renewing node certificate of node '%s'.", node_uuid)
194 last_exception = None
195 for i in range(self._MAX_NUM_RETRIES):
196 try:
197 new_digest = CreateNewClientCert(self, node_uuid)
198 if node_info.master_candidate:
199 utils.AddNodeToCandidateCerts(node_uuid,
200 new_digest,
201 cluster.candidate_certs)
202 logging.debug("Added the node's certificate to candidate"
203 " certificate list. Current list: %s.",
204 str(cluster.candidate_certs))
205 break
206 except errors.OpExecError as e:
207 last_exception = e
208 logging.error("Could not renew a non-master node's SSL node"
209 " certificate at attempt no. %s. The node's UUID"
210 " is %s, and the error was: %s.",
211 str(i), node_uuid, e)
212 else:
213 if last_exception:
214 node_errors[node_uuid] = last_exception
215
216 if node_errors:
217 msg = ("Some nodes' SSL client certificates could not be renewed."
218 " Please make sure those nodes are reachable and rerun"
219 " the operation. The affected nodes and their errors are:\n")
220 for uuid, e in node_errors.items():
221 msg += "Node %s: %s\n" % (uuid, e)
222 feedback_fn(msg)
223
224 utils.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid,
225 cluster.candidate_certs)
226 utils.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid,
227 cluster.candidate_certs)
228 logging.debug("Cleaned up *-SERVER and *-OLDMASTER certificate from"
229 " master candidate cert list. Current state of the"
230 " list: %s.", cluster.candidate_certs)
231
232
233 logging.debug("Trigger an update of the configuration on all nodes.")
234 self.cfg.Update(cluster, feedback_fn)
235
238 """Activate the master IP on the master node.
239
240 """
241 def Exec(self, feedback_fn):
250
253 """Deactivate the master IP on the master node.
254
255 """
256 def Exec(self, feedback_fn):
265
268 """Return configuration values.
269
270 """
271 REQ_BGL = False
272
274 self.cq = ClusterQuery(None, self.op.output_fields, False)
275
278
281
282 def Exec(self, feedback_fn):
283 result = self.cq.OldStyleQuery(self)
284
285 assert len(result) == 1
286
287 return result[0]
288
291 """Logical unit for destroying the cluster.
292
293 """
294 HPATH = "cluster-destroy"
295 HTYPE = constants.HTYPE_CLUSTER
296
298 """Build hooks env.
299
300 """
301 return {
302 "OP_TARGET": self.cfg.GetClusterName(),
303 }
304
306 """Build hooks nodes.
307
308 """
309 return ([], [])
310
312 """Check prerequisites.
313
314 This checks whether the cluster is empty.
315
316 Any errors are signaled by raising errors.OpPrereqError.
317
318 """
319 master = self.cfg.GetMasterNode()
320
321 nodelist = self.cfg.GetNodeList()
322 if len(nodelist) != 1 or nodelist[0] != master:
323 raise errors.OpPrereqError("There are still %d node(s) in"
324 " this cluster." % (len(nodelist) - 1),
325 errors.ECODE_INVAL)
326 instancelist = self.cfg.GetInstanceList()
327 if instancelist:
328 raise errors.OpPrereqError("There are still %d instance(s) in"
329 " this cluster." % len(instancelist),
330 errors.ECODE_INVAL)
331
332 def Exec(self, feedback_fn):
346
347
348 class LUClusterPostInit(LogicalUnit):
349 """Logical unit for running hooks after cluster initialization.
350
351 """
352 HPATH = "cluster-init"
353 HTYPE = constants.HTYPE_CLUSTER
354
355 def CheckArguments(self):
356 self.master_uuid = self.cfg.GetMasterNode()
357 self.master_ndparams = self.cfg.GetNdParams(self.cfg.GetMasterNodeInfo())
358
359
360
361
362
363
364 if (self.master_ndparams[constants.ND_OVS] and not
365 self.master_ndparams.get(constants.ND_OVS_LINK, None)):
366 self.LogInfo("No physical interface for OpenvSwitch was given."
367 " OpenvSwitch will not have an outside connection. This"
368 " might not be what you want.")
369
370 def BuildHooksEnv(self):
371 """Build hooks env.
372
373 """
374 return {
375 "OP_TARGET": self.cfg.GetClusterName(),
376 }
377
378 def BuildHooksNodes(self):
379 """Build hooks nodes.
380
381 """
382 return ([], [self.cfg.GetMasterNode()])
383
384 def Exec(self, feedback_fn):
385 """Create and configure Open vSwitch
386
387 """
388 if self.master_ndparams[constants.ND_OVS]:
389 result = self.rpc.call_node_configure_ovs(
390 self.master_uuid,
391 self.master_ndparams[constants.ND_OVS_NAME],
392 self.master_ndparams.get(constants.ND_OVS_LINK, None))
393 result.Raise("Could not successfully configure Open vSwitch")
394
395 cluster = self.cfg.GetClusterInfo()
396 _UpdateMasterClientCert(self, self.master_uuid, cluster, feedback_fn)
397
398 return True
399
455
458 """Query cluster configuration.
459
460 """
461 REQ_BGL = False
462
464 self.needed_locks = {}
465
466 def Exec(self, feedback_fn):
467 """Return cluster config.
468
469 """
470 cluster = self.cfg.GetClusterInfo()
471 os_hvp = {}
472
473
474 for os_name, hv_dict in cluster.os_hvp.items():
475 os_hvp[os_name] = {}
476 for hv_name, hv_params in hv_dict.items():
477 if hv_name in cluster.enabled_hypervisors:
478 os_hvp[os_name][hv_name] = hv_params
479
480
481 primary_ip_version = constants.IP4_VERSION
482 if cluster.primary_ip_family == netutils.IP6Address.family:
483 primary_ip_version = constants.IP6_VERSION
484
485 result = {
486 "software_version": constants.RELEASE_VERSION,
487 "protocol_version": constants.PROTOCOL_VERSION,
488 "config_version": constants.CONFIG_VERSION,
489 "os_api_version": max(constants.OS_API_VERSIONS),
490 "export_version": constants.EXPORT_VERSION,
491 "vcs_version": constants.VCS_VERSION,
492 "architecture": runtime.GetArchInfo(),
493 "name": cluster.cluster_name,
494 "master": self.cfg.GetMasterNodeName(),
495 "default_hypervisor": cluster.primary_hypervisor,
496 "enabled_hypervisors": cluster.enabled_hypervisors,
497 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
498 for hypervisor_name in cluster.enabled_hypervisors]),
499 "os_hvp": os_hvp,
500 "beparams": cluster.beparams,
501 "osparams": cluster.osparams,
502 "ipolicy": cluster.ipolicy,
503 "nicparams": cluster.nicparams,
504 "ndparams": cluster.ndparams,
505 "diskparams": cluster.diskparams,
506 "candidate_pool_size": cluster.candidate_pool_size,
507 "max_running_jobs": cluster.max_running_jobs,
508 "master_netdev": cluster.master_netdev,
509 "master_netmask": cluster.master_netmask,
510 "use_external_mip_script": cluster.use_external_mip_script,
511 "volume_group_name": cluster.volume_group_name,
512 "drbd_usermode_helper": cluster.drbd_usermode_helper,
513 "file_storage_dir": cluster.file_storage_dir,
514 "shared_file_storage_dir": cluster.shared_file_storage_dir,
515 "maintain_node_health": cluster.maintain_node_health,
516 "ctime": cluster.ctime,
517 "mtime": cluster.mtime,
518 "uuid": cluster.uuid,
519 "tags": list(cluster.GetTags()),
520 "uid_pool": cluster.uid_pool,
521 "default_iallocator": cluster.default_iallocator,
522 "default_iallocator_params": cluster.default_iallocator_params,
523 "reserved_lvs": cluster.reserved_lvs,
524 "primary_ip_version": primary_ip_version,
525 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
526 "hidden_os": cluster.hidden_os,
527 "blacklisted_os": cluster.blacklisted_os,
528 "enabled_disk_templates": cluster.enabled_disk_templates,
529 "enabled_user_shutdown": cluster.enabled_user_shutdown,
530 }
531
532 return result
533
536 """Force the redistribution of cluster configuration.
537
538 This is a very simple LU.
539
540 """
541 REQ_BGL = False
542
549
550 def Exec(self, feedback_fn):
556
559 """Rename the cluster.
560
561 """
562 HPATH = "cluster-rename"
563 HTYPE = constants.HTYPE_CLUSTER
564
566 """Build hooks env.
567
568 """
569 return {
570 "OP_TARGET": self.cfg.GetClusterName(),
571 "NEW_NAME": self.op.name,
572 }
573
579
602
603 def Exec(self, feedback_fn):
604 """Rename the cluster.
605
606 """
607 clustername = self.op.name
608 new_ip = self.ip
609
610
611 master_params = self.cfg.GetMasterNetworkParameters()
612 ems = self.cfg.GetUseExternalMipScript()
613 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid,
614 master_params, ems)
615 result.Raise("Could not disable the master role")
616
617 try:
618 cluster = self.cfg.GetClusterInfo()
619 cluster.cluster_name = clustername
620 cluster.master_ip = new_ip
621 self.cfg.Update(cluster, feedback_fn)
622
623
624 ssh.WriteKnownHostsFile(self.cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
625 node_list = self.cfg.GetOnlineNodeList()
626 try:
627 node_list.remove(master_params.uuid)
628 except ValueError:
629 pass
630 UploadHelper(self, node_list, pathutils.SSH_KNOWN_HOSTS_FILE)
631 finally:
632 master_params.ip = new_ip
633 result = self.rpc.call_node_activate_master_ip(master_params.uuid,
634 master_params, ems)
635 result.Warn("Could not re-enable the master role on the master,"
636 " please restart manually", self.LogWarning)
637
638 return clustername
639
642 """Verifies the cluster disks sizes.
643
644 """
645 REQ_BGL = False
646
648 if self.op.instances:
649 (_, self.wanted_names) = GetWantedInstances(self, self.op.instances)
650
651
652 self.needed_locks = {
653 locking.LEVEL_NODE_RES: [],
654 locking.LEVEL_INSTANCE: self.wanted_names,
655 }
656 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
657 else:
658 self.wanted_names = None
659 self.needed_locks = {
660 locking.LEVEL_NODE_RES: locking.ALL_SET,
661 locking.LEVEL_INSTANCE: locking.ALL_SET,
662
663
664 locking.LEVEL_NODE_ALLOC: locking.ALL_SET,
665 }
666
667 self.share_locks = {
668 locking.LEVEL_NODE_RES: 1,
669 locking.LEVEL_INSTANCE: 0,
670 locking.LEVEL_NODE_ALLOC: 1,
671 }
672
674 if level == locking.LEVEL_NODE_RES and self.wanted_names is not None:
675 self._LockInstancesNodes(primary_only=True, level=level)
676
678 """Check prerequisites.
679
680 This only checks the optional instance list against the existing names.
681
682 """
683 if self.wanted_names is None:
684 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
685
686 self.wanted_instances = \
687 map(compat.snd, self.cfg.GetMultiInstanceInfoByName(self.wanted_names))
688
690 """Ensure children of the disk have the needed disk size.
691
692 This is valid mainly for DRBD8 and fixes an issue where the
693 children have smaller disk size.
694
695 @param disk: an L{ganeti.objects.Disk} object
696
697 """
698 if disk.dev_type == constants.DT_DRBD8:
699 assert disk.children, "Empty children for DRBD8?"
700 fchild = disk.children[0]
701 mismatch = fchild.size < disk.size
702 if mismatch:
703 self.LogInfo("Child disk has size %d, parent %d, fixing",
704 fchild.size, disk.size)
705 fchild.size = disk.size
706
707
708 return self._EnsureChildSizes(fchild) or mismatch
709 else:
710 return False
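# Example (illustrative): for a DRBD8 disk of 10240 MiB whose first child LV
# only records 10200 MiB, the child's size is bumped to 10240 and True is
# returned so the caller knows the configuration needs to be written back.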
711
712 def Exec(self, feedback_fn):
713 """Verify the size of cluster disks.
714
715 """
716
717
718 per_node_disks = {}
719 for instance in self.wanted_instances:
720 pnode = instance.primary_node
721 if pnode not in per_node_disks:
722 per_node_disks[pnode] = []
723 for idx, disk in enumerate(instance.disks):
724 per_node_disks[pnode].append((instance, idx, disk))
725
726 assert not (frozenset(per_node_disks.keys()) -
727 self.owned_locks(locking.LEVEL_NODE_RES)), \
728 "Not owning correct locks"
729 assert not self.owned_locks(locking.LEVEL_NODE)
730
731 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
732 per_node_disks.keys())
733
734 changed = []
735 for node_uuid, dskl in per_node_disks.items():
736 if not dskl:
737
738 continue
739
740 newl = [([v[2].Copy()], v[0]) for v in dskl]
741 node_name = self.cfg.GetNodeName(node_uuid)
742 result = self.rpc.call_blockdev_getdimensions(node_uuid, newl)
743 if result.fail_msg:
744 self.LogWarning("Failure in blockdev_getdimensions call to node"
745 " %s, ignoring", node_name)
746 continue
747 if len(result.payload) != len(dskl):
748 logging.warning("Invalid result from node %s: len(dksl)=%d,"
749 " result.payload=%s", node_name, len(dskl),
750 result.payload)
751 self.LogWarning("Invalid result from node %s, ignoring node results",
752 node_name)
753 continue
754 for ((instance, idx, disk), dimensions) in zip(dskl, result.payload):
755 if dimensions is None:
756 self.LogWarning("Disk %d of instance %s did not return size"
757 " information, ignoring", idx, instance.name)
758 continue
759 if not isinstance(dimensions, (tuple, list)):
760 self.LogWarning("Disk %d of instance %s did not return valid"
761 " dimension information, ignoring", idx,
762 instance.name)
763 continue
764 (size, spindles) = dimensions
765 if not isinstance(size, (int, long)):
766 self.LogWarning("Disk %d of instance %s did not return valid"
767 " size information, ignoring", idx, instance.name)
768 continue
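# The size reported by the node is in bytes; shift by 20 bits to get MiB
# before comparing with the configured disk.size.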
769 size = size >> 20
770 if size != disk.size:
771 self.LogInfo("Disk %d of instance %s has mismatched size,"
772 " correcting: recorded %d, actual %d", idx,
773 instance.name, disk.size, size)
774 disk.size = size
775 self.cfg.Update(instance, feedback_fn)
776 changed.append((instance.name, idx, "size", size))
777 if es_flags[node_uuid]:
778 if spindles is None:
779 self.LogWarning("Disk %d of instance %s did not return valid"
780 " spindles information, ignoring", idx,
781 instance.name)
782 elif disk.spindles is None or disk.spindles != spindles:
783 self.LogInfo("Disk %d of instance %s has mismatched spindles,"
784 " correcting: recorded %s, actual %s",
785 idx, instance.name, disk.spindles, spindles)
786 disk.spindles = spindles
787 self.cfg.Update(instance, feedback_fn)
788 changed.append((instance.name, idx, "spindles", disk.spindles))
789 if self._EnsureChildSizes(disk):
790 self.cfg.Update(instance, feedback_fn)
791 changed.append((instance.name, idx, "size", disk.size))
792 return changed
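# The returned list holds one tuple per corrected value, for example
# (illustrative names): [("inst1.example.com", 0, "size", 10240),
#                        ("inst1.example.com", 1, "spindles", 2)].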
793
814
819 """Checks whether the given file-based storage directory is acceptable.
820
821 Note: This function is public, because it is also used in bootstrap.py.
822
823 @type logging_warn_fn: function
824 @param logging_warn_fn: function which accepts a string and logs it
825 @type file_storage_dir: string
826 @param file_storage_dir: the directory to be used for file-based instances
827 @type enabled_disk_templates: list of string
828 @param enabled_disk_templates: the list of enabled disk templates
829 @type file_disk_template: string
830 @param file_disk_template: the file-based disk template for which the
831 path should be checked
832
833 """
834 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
835 constants.ST_FILE, constants.ST_SHARED_FILE
836 ))
837 file_storage_enabled = file_disk_template in enabled_disk_templates
838 if file_storage_dir is not None:
839 if file_storage_dir == "":
840 if file_storage_enabled:
841 raise errors.OpPrereqError(
842 "Unsetting the '%s' storage directory while having '%s' storage"
843 " enabled is not permitted." %
844 (file_disk_template, file_disk_template))
845 else:
846 if not file_storage_enabled:
847 logging_warn_fn(
848 "Specified a %s storage directory, although %s storage is not"
849 " enabled." % (file_disk_template, file_disk_template))
850 else:
851 raise errors.ProgrammerError("Received %s storage dir with value"
852 " 'None'." % file_disk_template)
853
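# For example (illustrative), a call with file_disk_template=constants.DT_FILE
# and an empty file_storage_dir raises OpPrereqError if DT_FILE is among the
# enabled disk templates; with a non-empty directory and DT_FILE disabled it
# only emits a warning via logging_warn_fn.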
865
877
880 """Change the parameters of the cluster.
881
882 """
883 HPATH = "cluster-modify"
884 HTYPE = constants.HTYPE_CLUSTER
885 REQ_BGL = False
886
912
925
927 """Build hooks env.
928
929 """
930 return {
931 "OP_TARGET": self.cfg.GetClusterName(),
932 "NEW_VG_NAME": self.op.vg_name,
933 }
934
936 """Build hooks nodes.
937
938 """
939 mn = self.cfg.GetMasterNode()
940 return ([mn], [mn])
941
942 def _CheckVgName(self, node_uuids, enabled_disk_templates,
943 new_enabled_disk_templates):
944 """Check the consistency of the vg name on all nodes and in case it gets
945 unset whether there are instances still using it.
946
947 """
948 lvm_is_enabled = utils.IsLvmEnabled(enabled_disk_templates)
949 lvm_gets_enabled = utils.LvmGetsEnabled(enabled_disk_templates,
950 new_enabled_disk_templates)
951 current_vg_name = self.cfg.GetVGName()
952
953 if self.op.vg_name == '':
954 if lvm_is_enabled:
955 raise errors.OpPrereqError("Cannot unset volume group if lvm-based"
956 " disk templates are or get enabled.")
957
958 if self.op.vg_name is None:
959 if current_vg_name is None and lvm_is_enabled:
960 raise errors.OpPrereqError("Please specify a volume group when"
961 " enabling lvm-based disk-templates.")
962
963 if self.op.vg_name is not None and not self.op.vg_name:
964 if self.cfg.HasAnyDiskOfType(constants.DT_PLAIN):
965 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
966 " instances exist", errors.ECODE_INVAL)
967
968 if (self.op.vg_name is not None and lvm_is_enabled) or \
969 (self.cfg.GetVGName() is not None and lvm_gets_enabled):
970 self._CheckVgNameOnNodes(node_uuids)
971
992
993 @staticmethod
996 """Computes three sets of disk templates.
997
998 @see: C{_GetDiskTemplateSets} for more details.
999
1000 """
1001 enabled_disk_templates = None
1002 new_enabled_disk_templates = []
1003 disabled_disk_templates = []
1004 if op_enabled_disk_templates:
1005 enabled_disk_templates = op_enabled_disk_templates
1006 new_enabled_disk_templates = \
1007 list(set(enabled_disk_templates)
1008 - set(old_enabled_disk_templates))
1009 disabled_disk_templates = \
1010 list(set(old_enabled_disk_templates)
1011 - set(enabled_disk_templates))
1012 else:
1013 enabled_disk_templates = old_enabled_disk_templates
1014 return (enabled_disk_templates, new_enabled_disk_templates,
1015 disabled_disk_templates)
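# Example (illustrative): with op_enabled_disk_templates=["plain", "file"] and
# old_enabled_disk_templates=["plain", "drbd"], this returns
# (["plain", "file"], ["file"], ["drbd"]), i.e. (enabled, newly enabled,
# disabled); the order within the last two lists is undefined because they
# come from set differences.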
1016
1018 """Computes three sets of disk templates.
1019
1020 The three sets are:
1021 - disk templates that will be enabled after this operation (no matter if
1022 they were enabled before or not)
1023 - disk templates that get enabled by this operation (thus haven't been
1024 enabled before.)
1025 - disk templates that get disabled by this operation
1026
1027 """
1028 return self._GetDiskTemplateSetsInner(self.op.enabled_disk_templates,
1029 cluster.enabled_disk_templates)
1030
1032 """Checks the ipolicy.
1033
1034 @type cluster: C{objects.Cluster}
1035 @param cluster: the cluster's configuration
1036 @type enabled_disk_templates: list of string
1037 @param enabled_disk_templates: list of (possibly newly) enabled disk
1038 templates
1039
1040 """
1041
1042 if self.op.ipolicy:
1043 self.new_ipolicy = GetUpdatedIPolicy(cluster.ipolicy, self.op.ipolicy,
1044 group_policy=False)
1045
1046 CheckIpolicyVsDiskTemplates(self.new_ipolicy,
1047 enabled_disk_templates)
1048
1049 all_instances = self.cfg.GetAllInstancesInfo().values()
1050 violations = set()
1051 for group in self.cfg.GetAllNodeGroupsInfo().values():
1052 instances = frozenset([inst for inst in all_instances
1053 if compat.any(nuuid in group.members
1054 for nuuid in inst.all_nodes)])
1055 new_ipolicy = objects.FillIPolicy(self.new_ipolicy, group.ipolicy)
1056 ipol = masterd.instance.CalculateGroupIPolicy(cluster, group)
1057 new = ComputeNewInstanceViolations(ipol, new_ipolicy, instances,
1058 self.cfg)
1059 if new:
1060 violations.update(new)
1061
1062 if violations:
1063 self.LogWarning("After the ipolicy change the following instances"
1064 " violate them: %s",
1065 utils.CommaJoin(utils.NiceSort(violations)))
1066 else:
1067 CheckIpolicyVsDiskTemplates(cluster.ipolicy,
1068 enabled_disk_templates)
1069
1071 """Checks whether the set DRBD helper actually exists on the nodes.
1072
1073 @type drbd_helper: string
1074 @param drbd_helper: path of the drbd usermode helper binary
1075 @type node_uuids: list of strings
1076 @param node_uuids: list of node UUIDs to check for the helper
1077
1078 """
1079
1080 helpers = self.rpc.call_drbd_helper(node_uuids)
1081 for (_, ninfo) in self.cfg.GetMultiNodeInfo(node_uuids):
1082 if ninfo.offline:
1083 self.LogInfo("Not checking drbd helper on offline node %s",
1084 ninfo.name)
1085 continue
1086 msg = helpers[ninfo.uuid].fail_msg
1087 if msg:
1088 raise errors.OpPrereqError("Error checking drbd helper on node"
1089 " '%s': %s" % (ninfo.name, msg),
1090 errors.ECODE_ENVIRON)
1091 node_helper = helpers[ninfo.uuid].payload
1092 if node_helper != drbd_helper:
1093 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
1094 (ninfo.name, node_helper),
1095 errors.ECODE_ENVIRON)
1096
1098 """Check the DRBD usermode helper.
1099
1100 @type node_uuids: list of strings
1101 @param node_uuids: a list of nodes' UUIDs
1102 @type drbd_enabled: boolean
1103 @param drbd_enabled: whether DRBD will be enabled after this operation
1104 (no matter if it was disabled before or not)
1105 @type drbd_gets_enabled: boolean
1106 @param drbd_gets_enabled: true if DRBD was disabled before this
1107 operation, but will be enabled afterwards
1108
1109 """
1110 if self.op.drbd_helper == '':
1111 if drbd_enabled:
1112 raise errors.OpPrereqError("Cannot disable drbd helper while"
1113 " DRBD is enabled.")
1114 if self.cfg.HasAnyDiskOfType(constants.DT_DRBD8):
1115 raise errors.OpPrereqError("Cannot disable drbd helper while"
1116 " drbd-based instances exist",
1117 errors.ECODE_INVAL)
1118
1119 else:
1120 if self.op.drbd_helper is not None and drbd_enabled:
1121 self._CheckDrbdHelperOnNodes(self.op.drbd_helper, node_uuids)
1122 else:
1123 if drbd_gets_enabled:
1124 current_drbd_helper = self.cfg.GetClusterInfo().drbd_usermode_helper
1125 if current_drbd_helper is not None:
1126 self._CheckDrbdHelperOnNodes(current_drbd_helper, node_uuids)
1127 else:
1128 raise errors.OpPrereqError("Cannot enable DRBD without a"
1129 " DRBD usermode helper set.")
1130
1133 """Check whether we try to disable a disk template that is in use.
1134
1135 @type disabled_disk_templates: list of string
1136 @param disabled_disk_templates: list of disk templates that are going to
1137 be disabled by this operation
1138
1139 """
1140 for disk_template in disabled_disk_templates:
1141 if self.cfg.HasAnyDiskOfType(disk_template):
1142 raise errors.OpPrereqError(
1143 "Cannot disable disk template '%s', because there is at least one"
1144 " instance using it." % disk_template)
1145
1147 """Check prerequisites.
1148
1149 This checks that the given parameters don't conflict and that the given
1150 volume group is valid.
1151
1152 """
1153 node_uuids = self.owned_locks(locking.LEVEL_NODE)
1154 self.cluster = cluster = self.cfg.GetClusterInfo()
1155
1156 vm_capable_node_uuids = [node.uuid
1157 for node in self.cfg.GetAllNodesInfo().values()
1158 if node.uuid in node_uuids and node.vm_capable]
1159
1160 (enabled_disk_templates, new_enabled_disk_templates,
1161 disabled_disk_templates) = self._GetDiskTemplateSets(cluster)
1162 self._CheckInstancesOfDisabledDiskTemplates(disabled_disk_templates)
1163
1164 self._CheckVgName(vm_capable_node_uuids, enabled_disk_templates,
1165 new_enabled_disk_templates)
1166
1167 if self.op.file_storage_dir is not None:
1168 CheckFileStoragePathVsEnabledDiskTemplates(
1169 self.LogWarning, self.op.file_storage_dir, enabled_disk_templates)
1170
1171 if self.op.shared_file_storage_dir is not None:
1172 CheckSharedFileStoragePathVsEnabledDiskTemplates(
1173 self.LogWarning, self.op.shared_file_storage_dir,
1174 enabled_disk_templates)
1175
1176 drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
1177 drbd_gets_enabled = constants.DT_DRBD8 in new_enabled_disk_templates
1178 self._CheckDrbdHelper(vm_capable_node_uuids,
1179 drbd_enabled, drbd_gets_enabled)
1180
1181
1182 if self.op.beparams:
1183 objects.UpgradeBeParams(self.op.beparams)
1184 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
1185 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
1186
1187 if self.op.ndparams:
1188 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
1189 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
1190
1191
1192
1193 if self.new_ndparams["oob_program"] == "":
1194 self.new_ndparams["oob_program"] = \
1195 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
1196
1197 if self.op.hv_state:
1198 new_hv_state = MergeAndVerifyHvState(self.op.hv_state,
1199 self.cluster.hv_state_static)
1200 self.new_hv_state = dict((hv, cluster.SimpleFillHvState(values))
1201 for hv, values in new_hv_state.items())
1202
1203 if self.op.disk_state:
1204 new_disk_state = MergeAndVerifyDiskState(self.op.disk_state,
1205 self.cluster.disk_state_static)
1206 self.new_disk_state = \
1207 dict((storage, dict((name, cluster.SimpleFillDiskState(values))
1208 for name, values in svalues.items()))
1209 for storage, svalues in new_disk_state.items())
1210
1211 self._CheckIpolicy(cluster, enabled_disk_templates)
1212
1213 if self.op.nicparams:
1214 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
1215 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
1216 objects.NIC.CheckParameterSyntax(self.new_nicparams)
1217 nic_errors = []
1218
1219
1220 for instance in self.cfg.GetAllInstancesInfo().values():
1221 for nic_idx, nic in enumerate(instance.nics):
1222 params_copy = copy.deepcopy(nic.nicparams)
1223 params_filled = objects.FillDict(self.new_nicparams, params_copy)
1224
1225
1226 try:
1227 objects.NIC.CheckParameterSyntax(params_filled)
1228 except errors.ConfigurationError as err:
1229 nic_errors.append("Instance %s, nic/%d: %s" %
1230 (instance.name, nic_idx, err))
1231
1232
1233 target_mode = params_filled[constants.NIC_MODE]
1234 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
1235 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
1236 " address" % (instance.name, nic_idx))
1237 if nic_errors:
1238 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
1239 "\n".join(nic_errors), errors.ECODE_INVAL)
1240
1241
1242 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
1243 if self.op.hvparams:
1244 for hv_name, hv_dict in self.op.hvparams.items():
1245 if hv_name not in self.new_hvparams:
1246 self.new_hvparams[hv_name] = hv_dict
1247 else:
1248 self.new_hvparams[hv_name].update(hv_dict)
1249
1250
1251 self.new_diskparams = objects.FillDict(cluster.diskparams, {})
1252 if self.op.diskparams:
1253 for dt_name, dt_params in self.op.diskparams.items():
1254 if dt_name not in self.new_diskparams:
1255 self.new_diskparams[dt_name] = dt_params
1256 else:
1257 self.new_diskparams[dt_name].update(dt_params)
1258 CheckDiskAccessModeConsistency(self.op.diskparams, self.cfg)
1259
1260
1261 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
1262 if self.op.os_hvp:
1263 for os_name, hvs in self.op.os_hvp.items():
1264 if os_name not in self.new_os_hvp:
1265 self.new_os_hvp[os_name] = hvs
1266 else:
1267 for hv_name, hv_dict in hvs.items():
1268 if hv_dict is None:
1269
1270 self.new_os_hvp[os_name].pop(hv_name, None)
1271 elif hv_name not in self.new_os_hvp[os_name]:
1272 self.new_os_hvp[os_name][hv_name] = hv_dict
1273 else:
1274 self.new_os_hvp[os_name][hv_name].update(hv_dict)
1275
1276
1277 self.new_osp = objects.FillDict(cluster.osparams, {})
1278 if self.op.osparams:
1279 for os_name, osp in self.op.osparams.items():
1280 if os_name not in self.new_osp:
1281 self.new_osp[os_name] = {}
1282
1283 self.new_osp[os_name] = GetUpdatedParams(self.new_osp[os_name], osp,
1284 use_none=True)
1285
1286 if not self.new_osp[os_name]:
1287
1288 del self.new_osp[os_name]
1289 else:
1290
1291 CheckOSParams(self, False, [self.cfg.GetMasterNode()],
1292 os_name, self.new_osp[os_name])
1293
1294
1295 if self.op.enabled_hypervisors is not None:
1296 for hv in self.op.enabled_hypervisors:
1297
1298
1299
1300
1301
1302 if hv not in new_hvp:
1303 new_hvp[hv] = {}
1304 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
1305 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
1306
1307 if self.op.hvparams or self.op.enabled_hypervisors is not None:
1308
1309 for hv_name, hv_params in self.new_hvparams.items():
1310 if ((self.op.hvparams and hv_name in self.op.hvparams) or
1311 (self.op.enabled_hypervisors and
1312 hv_name in self.op.enabled_hypervisors)):
1313
1314 hv_class = hypervisor.GetHypervisorClass(hv_name)
1315 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1316 hv_class.CheckParameterSyntax(hv_params)
1317 CheckHVParams(self, node_uuids, hv_name, hv_params)
1318
1319 self._CheckDiskTemplateConsistency()
1320
1321 if self.op.os_hvp:
1322
1323
1324 for os_name, os_hvp in self.new_os_hvp.items():
1325 for hv_name, hv_params in os_hvp.items():
1326 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1327
1328 cluster_defaults = self.new_hvparams.get(hv_name, {})
1329 new_osp = objects.FillDict(cluster_defaults, hv_params)
1330 hv_class = hypervisor.GetHypervisorClass(hv_name)
1331 hv_class.CheckParameterSyntax(new_osp)
1332 CheckHVParams(self, node_uuids, hv_name, new_osp)
1333
1334 if self.op.default_iallocator:
1335 alloc_script = utils.FindFile(self.op.default_iallocator,
1336 constants.IALLOCATOR_SEARCH_PATH,
1337 os.path.isfile)
1338 if alloc_script is None:
1339 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
1340 " specified" % self.op.default_iallocator,
1341 errors.ECODE_INVAL)
1342
1344 """Check whether the disk templates that are going to be disabled
1345 are still in use by some instances.
1346
1347 """
1348 if self.op.enabled_disk_templates:
1349 cluster = self.cfg.GetClusterInfo()
1350 instances = self.cfg.GetAllInstancesInfo()
1351
1352 disk_templates_to_remove = set(cluster.enabled_disk_templates) \
1353 - set(self.op.enabled_disk_templates)
1354 for instance in instances.itervalues():
1355 if instance.disk_template in disk_templates_to_remove:
1356 raise errors.OpPrereqError("Cannot disable disk template '%s',"
1357 " because instance '%s' is using it." %
1358 (instance.disk_template, instance.name))
1359
1361 """Determines and sets the new volume group name.
1362
1363 """
1364 if self.op.vg_name is not None:
1365 new_volume = self.op.vg_name
1366 if not new_volume:
1367 new_volume = None
1368 if new_volume != self.cfg.GetVGName():
1369 self.cfg.SetVGName(new_volume)
1370 else:
1371 feedback_fn("Cluster LVM configuration already in desired"
1372 " state, not changing")
1373
1375 """Set the file storage directory.
1376
1377 """
1378 if self.op.file_storage_dir is not None:
1379 if self.cluster.file_storage_dir == self.op.file_storage_dir:
1380 feedback_fn("Global file storage dir already set to value '%s'"
1381 % self.cluster.file_storage_dir)
1382 else:
1383 self.cluster.file_storage_dir = self.op.file_storage_dir
1384
1386 """Set the shared file storage directory.
1387
1388 """
1389 if self.op.shared_file_storage_dir is not None:
1390 if self.cluster.shared_file_storage_dir == \
1391 self.op.shared_file_storage_dir:
1392 feedback_fn("Global shared file storage dir already set to value '%s'"
1393 % self.cluster.shared_file_storage_dir)
1394 else:
1395 self.cluster.shared_file_storage_dir = self.op.shared_file_storage_dir
1396
1398 """Set the DRBD usermode helper.
1399
1400 """
1401 if self.op.drbd_helper is not None:
1402 if not constants.DT_DRBD8 in self.cluster.enabled_disk_templates:
1403 feedback_fn("Note that you specified a drbd user helper, but did not"
1404 " enable the drbd disk template.")
1405 new_helper = self.op.drbd_helper
1406 if not new_helper:
1407 new_helper = None
1408 if new_helper != self.cfg.GetDRBDHelper():
1409 self.cfg.SetDRBDHelper(new_helper)
1410 else:
1411 feedback_fn("Cluster DRBD helper already in desired state,"
1412 " not changing")
1413
1414 def Exec(self, feedback_fn):
1415 """Change the parameters of the cluster.
1416
1417 """
1418 if self.op.enabled_disk_templates:
1419 self.cluster.enabled_disk_templates = \
1420 list(self.op.enabled_disk_templates)
1421
1422 self._SetVgName(feedback_fn)
1423 self._SetFileStorageDir(feedback_fn)
1424 self._SetSharedFileStorageDir(feedback_fn)
1425 self._SetDrbdHelper(feedback_fn)
1426
1427 ensure_kvmd = False
1428
1429 if self.op.hvparams:
1430 self.cluster.hvparams = self.new_hvparams
1431 if self.op.os_hvp:
1432 self.cluster.os_hvp = self.new_os_hvp
1433 if self.op.enabled_hypervisors is not None:
1434 self.cluster.hvparams = self.new_hvparams
1435 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
1436 ensure_kvmd = True
1437 if self.op.beparams:
1438 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
1439 if self.op.nicparams:
1440 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
1441 if self.op.ipolicy:
1442 self.cluster.ipolicy = self.new_ipolicy
1443 if self.op.osparams:
1444 self.cluster.osparams = self.new_osp
1445 if self.op.ndparams:
1446 self.cluster.ndparams = self.new_ndparams
1447 if self.op.diskparams:
1448 self.cluster.diskparams = self.new_diskparams
1449 if self.op.hv_state:
1450 self.cluster.hv_state_static = self.new_hv_state
1451 if self.op.disk_state:
1452 self.cluster.disk_state_static = self.new_disk_state
1453
1454 if self.op.candidate_pool_size is not None:
1455 self.cluster.candidate_pool_size = self.op.candidate_pool_size
1456
1457 AdjustCandidatePool(self, [], feedback_fn)
1458
1459 if self.op.max_running_jobs is not None:
1460 self.cluster.max_running_jobs = self.op.max_running_jobs
1461
1462 if self.op.maintain_node_health is not None:
1463 if self.op.maintain_node_health and not constants.ENABLE_CONFD:
1464 feedback_fn("Note: CONFD was disabled at build time, node health"
1465 " maintenance is not useful (still enabling it)")
1466 self.cluster.maintain_node_health = self.op.maintain_node_health
1467
1468 if self.op.modify_etc_hosts is not None:
1469 self.cluster.modify_etc_hosts = self.op.modify_etc_hosts
1470
1471 if self.op.prealloc_wipe_disks is not None:
1472 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
1473
1474 if self.op.add_uids is not None:
1475 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
1476
1477 if self.op.remove_uids is not None:
1478 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
1479
1480 if self.op.uid_pool is not None:
1481 self.cluster.uid_pool = self.op.uid_pool
1482
1483 if self.op.default_iallocator is not None:
1484 self.cluster.default_iallocator = self.op.default_iallocator
1485
1486 if self.op.default_iallocator_params is not None:
1487 self.cluster.default_iallocator_params = self.op.default_iallocator_params
1488
1489 if self.op.reserved_lvs is not None:
1490 self.cluster.reserved_lvs = self.op.reserved_lvs
1491
1492 if self.op.use_external_mip_script is not None:
1493 self.cluster.use_external_mip_script = self.op.use_external_mip_script
1494
1495 if self.op.enabled_user_shutdown is not None and \
1496 self.cluster.enabled_user_shutdown != self.op.enabled_user_shutdown:
1497 self.cluster.enabled_user_shutdown = self.op.enabled_user_shutdown
1498 ensure_kvmd = True
1499
1500 def helper_os(aname, mods, desc):
1501 desc += " OS list"
1502 lst = getattr(self.cluster, aname)
1503 for key, val in mods:
1504 if key == constants.DDM_ADD:
1505 if val in lst:
1506 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
1507 else:
1508 lst.append(val)
1509 elif key == constants.DDM_REMOVE:
1510 if val in lst:
1511 lst.remove(val)
1512 else:
1513 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
1514 else:
1515 raise errors.ProgrammerError("Invalid modification '%s'" % key)
1516
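# Example (illustrative): helper_os("hidden_os",
# [(constants.DDM_ADD, "debian-image")], "hidden") appends "debian-image"
# to cluster.hidden_os unless it is already listed.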
1517 if self.op.hidden_os:
1518 helper_os("hidden_os", self.op.hidden_os, "hidden")
1519
1520 if self.op.blacklisted_os:
1521 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
1522
1523 if self.op.master_netdev:
1524 master_params = self.cfg.GetMasterNetworkParameters()
1525 ems = self.cfg.GetUseExternalMipScript()
1526 feedback_fn("Shutting down master ip on the current netdev (%s)" %
1527 self.cluster.master_netdev)
1528 result = self.rpc.call_node_deactivate_master_ip(master_params.uuid,
1529 master_params, ems)
1530 if not self.op.force:
1531 result.Raise("Could not disable the master ip")
1532 else:
1533 if result.fail_msg:
1534 msg = ("Could not disable the master ip (continuing anyway): %s" %
1535 result.fail_msg)
1536 feedback_fn(msg)
1537 feedback_fn("Changing master_netdev from %s to %s" %
1538 (master_params.netdev, self.op.master_netdev))
1539 self.cluster.master_netdev = self.op.master_netdev
1540
1541 if self.op.master_netmask:
1542 master_params = self.cfg.GetMasterNetworkParameters()
1543 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
1544 result = self.rpc.call_node_change_master_netmask(
1545 master_params.uuid, master_params.netmask,
1546 self.op.master_netmask, master_params.ip,
1547 master_params.netdev)
1548 result.Warn("Could not change the master IP netmask", feedback_fn)
1549 self.cluster.master_netmask = self.op.master_netmask
1550
1551 self.cfg.Update(self.cluster, feedback_fn)
1552
1553 if self.op.master_netdev:
1554 master_params = self.cfg.GetMasterNetworkParameters()
1555 feedback_fn("Starting the master ip on the new master netdev (%s)" %
1556 self.op.master_netdev)
1557 ems = self.cfg.GetUseExternalMipScript()
1558 result = self.rpc.call_node_activate_master_ip(master_params.uuid,
1559 master_params, ems)
1560 result.Warn("Could not re-enable the master ip on the master,"
1561 " please restart manually", self.LogWarning)
1562
1563
1564
1565
1566
1567 if ensure_kvmd:
1568 EnsureKvmdOnNodes(self, feedback_fn)
1569
1572 """Submits all jobs necessary to verify the cluster.
1573
1574 """
1575 REQ_BGL = False
1576
1578 self.needed_locks = {}
1579
1580 def Exec(self, feedback_fn):
1581 jobs = []
1582
1583 if self.op.group_name:
1584 groups = [self.op.group_name]
1585 depends_fn = lambda: None
1586 else:
1587 groups = self.cfg.GetNodeGroupList()
1588
1589
1590 jobs.append([
1591 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
1592 ])
1593
1594
1595 depends_fn = lambda: [(-len(jobs), [])]
1596
1597 jobs.extend(
1598 [opcodes.OpClusterVerifyGroup(group_name=group,
1599 ignore_errors=self.op.ignore_errors,
1600 depends=depends_fn())]
1601 for group in groups)
1602
1603
1604 for op in itertools.chain(*jobs):
1605 op.debug_simulate_errors = self.op.debug_simulate_errors
1606 op.verbose = self.op.verbose
1607 op.error_codes = self.op.error_codes
1608 try:
1609 op.skip_checks = self.op.skip_checks
1610 except AttributeError:
1611 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1612
1613 return ResultWithJobs(jobs)
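# Example (illustrative): on a cluster with three node groups and no
# group_name given, this submits one OpClusterVerifyConfig job followed by
# three OpClusterVerifyGroup jobs, each of which depends on the config
# verification job via depends_fn.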
1614
1617 """Mix-in for cluster/group verify LUs.
1618
1619 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1620 self.op and self._feedback_fn to be available.)
1621
1622 """
1623
1624 ETYPE_FIELD = "code"
1625 ETYPE_ERROR = constants.CV_ERROR
1626 ETYPE_WARNING = constants.CV_WARNING
1627
1628 def _Error(self, ecode, item, msg, *args, **kwargs):
1629 """Format an error message.
1630
1631 Based on the opcode's error_codes parameter, either format a
1632 parseable error code, or a simpler error string.
1633
1634 This must be called only from Exec and functions called from Exec.
1635
1636 """
1637 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1638 itype, etxt, _ = ecode
1639
1640
1641 if etxt in self.op.ignore_errors:
1642 ltype = self.ETYPE_WARNING
1643
1644 if args:
1645 msg = msg % args
1646
1647 if self.op.error_codes:
1648 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1649 else:
1650 if item:
1651 item = " " + item
1652 else:
1653 item = ""
1654 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1655
1656 self._feedback_fn(" - %s" % msg)
1657
1658 if ltype == self.ETYPE_ERROR:
1659 self.bad = True
1660
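# Example (illustrative): with self.op.error_codes set, an entry is emitted
# in the machine-parseable form
# "<severity>:<error code>:<item type>:<item>:<message>"; otherwise a plain
# "<severity>: <item type> <item>: <message>" line is used. Error codes
# listed in self.op.ignore_errors are downgraded to warnings.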
1661 def _ErrorIf(self, cond, *args, **kwargs):
1662 """Log an error message if the passed condition is True.
1663
1664 """
1665 if (bool(cond)
1666 or self.op.debug_simulate_errors):
1667 self._Error(*args, **kwargs)
1668
1671 """Compute the set of all hypervisor parameters.
1672
1673 @type cluster: L{objects.Cluster}
1674 @param cluster: the cluster object
1675 @type instances: list of L{objects.Instance}
1676 @param instances: additional instances from which to obtain parameters
1677 @rtype: list of (origin, hypervisor, parameters)
1678 @return: a list with all parameters found, indicating the hypervisor they
1679 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1680
1681 """
1682 hvp_data = []
1683
1684 for hv_name in cluster.enabled_hypervisors:
1685 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1686
1687 for os_name, os_hvp in cluster.os_hvp.items():
1688 for hv_name, hv_params in os_hvp.items():
1689 if hv_params:
1690 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1691 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1692
1693
1694 for instance in instances:
1695 if instance.hvparams:
1696 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1697 cluster.FillHV(instance)))
1698
1699 return hvp_data
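# Example of the returned structure (illustrative values):
#   [("cluster", "kvm", {<cluster defaults>}),
#    ("os debian-image", "kvm", {<defaults updated with the OS overrides>}),
#    ("instance inst1.example.com", "kvm", {<fully filled instance view>})]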
1700
1703 """Verifies the cluster config.
1704
1705 """
1706 REQ_BGL = False
1707
1721
1725
1734
1735 - def Exec(self, feedback_fn):
1736 """Verify integrity of cluster, performing various test on nodes.
1737
1738 """
1739 self.bad = False
1740 self._feedback_fn = feedback_fn
1741
1742 feedback_fn("* Verifying cluster config")
1743
1744 for msg in self.cfg.VerifyConfig():
1745 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
1746
1747 feedback_fn("* Verifying cluster certificate files")
1748
1749 for cert_filename in pathutils.ALL_CERT_FILES:
1750 (errcode, msg) = utils.VerifyCertificate(cert_filename)
1751 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
1752
1753 self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
1754 pathutils.NODED_CERT_FILE),
1755 constants.CV_ECLUSTERCERT,
1756 None,
1757 pathutils.NODED_CERT_FILE + " must be accessible by the " +
1758 constants.LUXID_USER + " user")
1759
1760 feedback_fn("* Verifying hypervisor parameters")
1761
1762 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1763 self.all_inst_info.values()))
1764
1765 feedback_fn("* Verifying all nodes belong to an existing group")
1766
1767
1768
1769
1770
1771 dangling_nodes = set(node for node in self.all_node_info.values()
1772 if node.group not in self.all_group_info)
1773
1774 dangling_instances = {}
1775 no_node_instances = []
1776
1777 for inst in self.all_inst_info.values():
1778 if inst.primary_node in [node.uuid for node in dangling_nodes]:
1779 dangling_instances.setdefault(inst.primary_node, []).append(inst)
1780 elif inst.primary_node not in self.all_node_info:
1781 no_node_instances.append(inst)
1782
1783 pretty_dangling = [
1784 "%s (%s)" %
1785 (node.name,
1786 utils.CommaJoin(inst.name for
1787 inst in dangling_instances.get(node.uuid, [])))
1788 for node in dangling_nodes]
1789
1790 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
1791 None,
1792 "the following nodes (and their instances) belong to a non"
1793 " existing group: %s", utils.CommaJoin(pretty_dangling))
1794
1795 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
1796 None,
1797 "the following instances have a non-existing primary-node:"
1798 " %s", utils.CommaJoin(inst.name for
1799 inst in no_node_instances))
1800
1801 return not self.bad
1802
1805 """Verifies the status of a node group.
1806
1807 """
1808 HPATH = "cluster-verify"
1809 HTYPE = constants.HTYPE_CLUSTER
1810 REQ_BGL = False
1811
1812 _HOOKS_INDENT_RE = re.compile("^", re.M)
1813
1815 """A class representing the logical and physical status of a node.
1816
1817 @type uuid: string
1818 @ivar uuid: the node UUID to which this object refers
1819 @ivar volumes: a structure as returned from
1820 L{ganeti.backend.GetVolumeList} (runtime)
1821 @ivar instances: a list of running instances (runtime)
1822 @ivar pinst: list of configured primary instances (config)
1823 @ivar sinst: list of configured secondary instances (config)
1824 @ivar sbp: dictionary of {primary-node: list of instances} for all
1825 instances for which this node is secondary (config)
1826 @ivar mfree: free memory, as reported by hypervisor (runtime)
1827 @ivar dfree: free disk, as reported by the node (runtime)
1828 @ivar offline: the offline status (config)
1829 @type rpc_fail: boolean
1830 @ivar rpc_fail: whether the RPC verify call was successful (overall,
1831 not whether the individual keys were correct) (runtime)
1832 @type lvm_fail: boolean
1833 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1834 @type hyp_fail: boolean
1835 @ivar hyp_fail: whether the RPC call didn't return the instance list
1836 @type ghost: boolean
1837 @ivar ghost: whether this is a known node or not (config)
1838 @type os_fail: boolean
1839 @ivar os_fail: whether the RPC call didn't return valid OS data
1840 @type oslist: list
1841 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1842 @type vm_capable: boolean
1843 @ivar vm_capable: whether the node can host instances
1844 @type pv_min: float
1845 @ivar pv_min: size in MiB of the smallest PVs
1846 @type pv_max: float
1847 @ivar pv_max: size in MiB of the biggest PVs
1848
1849 """
1850 def __init__(self, offline=False, uuid=None, vm_capable=True):
1851 self.uuid = uuid
1852 self.volumes = {}
1853 self.instances = []
1854 self.pinst = []
1855 self.sinst = []
1856 self.sbp = {}
1857 self.mfree = 0
1858 self.dfree = 0
1859 self.offline = offline
1860 self.vm_capable = vm_capable
1861 self.rpc_fail = False
1862 self.lvm_fail = False
1863 self.hyp_fail = False
1864 self.ghost = False
1865 self.os_fail = False
1866 self.oslist = {}
1867 self.pv_min = None
1868 self.pv_max = None
1869
1890
1908
1910 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1911 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1912
1913 group_node_uuids = set(self.group_info.members)
1914 group_inst_uuids = \
1915 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
1916
1917 unlocked_node_uuids = \
1918 group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))
1919
1920 unlocked_inst_uuids = \
1921 group_inst_uuids.difference(
1922 [self.cfg.GetInstanceInfoByName(name).uuid
1923 for name in self.owned_locks(locking.LEVEL_INSTANCE)])
1924
1925 if unlocked_node_uuids:
1926 raise errors.OpPrereqError(
1927 "Missing lock for nodes: %s" %
1928 utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
1929 errors.ECODE_STATE)
1930
1931 if unlocked_inst_uuids:
1932 raise errors.OpPrereqError(
1933 "Missing lock for instances: %s" %
1934 utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
1935 errors.ECODE_STATE)
1936
1937 self.all_node_info = self.cfg.GetAllNodesInfo()
1938 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1939
1940 self.my_node_uuids = group_node_uuids
1941 self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
1942 for node_uuid in group_node_uuids)
1943
1944 self.my_inst_uuids = group_inst_uuids
1945 self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
1946 for inst_uuid in group_inst_uuids)
1947
1948
1949
1950 extra_lv_nodes = set()
1951
1952 for inst in self.my_inst_info.values():
1953 if inst.disk_template in constants.DTS_INT_MIRROR:
1954 for nuuid in inst.all_nodes:
1955 if self.all_node_info[nuuid].group != self.group_uuid:
1956 extra_lv_nodes.add(nuuid)
1957
1958 unlocked_lv_nodes = \
1959 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1960
1961 if unlocked_lv_nodes:
1962 raise errors.OpPrereqError("Missing node locks for LV check: %s" %
1963 utils.CommaJoin(unlocked_lv_nodes),
1964 errors.ECODE_STATE)
1965 self.extra_lv_nodes = list(extra_lv_nodes)
1966
1968 """Perform some basic validation on data returned from a node.
1969
1970 - check the result data structure is well formed and has all the
1971 mandatory fields
1972 - check ganeti version
1973
1974 @type ninfo: L{objects.Node}
1975 @param ninfo: the node to check
1976 @param nresult: the results from the node
1977 @rtype: boolean
1978 @return: whether overall this call was successful (and we can expect
1979 reasonable values in the response)
1980
1981 """
1982
1983 test = not nresult or not isinstance(nresult, dict)
1984 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
1985 "unable to verify node: no data returned")
1986 if test:
1987 return False
1988
1989
1990 local_version = constants.PROTOCOL_VERSION
1991 remote_version = nresult.get("version", None)
1992 test = not (remote_version and
1993 isinstance(remote_version, (list, tuple)) and
1994 len(remote_version) == 2)
1995 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
1996 "connection to node returned invalid data")
1997 if test:
1998 return False
1999
2000 test = local_version != remote_version[0]
2001 self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
2002 "incompatible protocol versions: master %s,"
2003 " node %s", local_version, remote_version[0])
2004 if test:
2005 return False
2006
2007
2008
2009
2010 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
2011 constants.CV_ENODEVERSION, ninfo.name,
2012 "software version mismatch: master %s, node %s",
2013 constants.RELEASE_VERSION, remote_version[1],
2014 code=self.ETYPE_WARNING)
2015
2016 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
2017 if ninfo.vm_capable and isinstance(hyp_result, dict):
2018 for hv_name, hv_result in hyp_result.iteritems():
2019 test = hv_result is not None
2020 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2021 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
2022
2023 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
2024 if ninfo.vm_capable and isinstance(hvp_result, list):
2025 for item, hv_name, hv_result in hvp_result:
2026 self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
2027 "hypervisor %s parameter verify failure (source %s): %s",
2028 hv_name, item, hv_result)
2029
2030 test = nresult.get(constants.NV_NODESETUP,
2031 ["Missing NODESETUP results"])
2032 self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
2033 "node setup error: %s", "; ".join(test))
2034
2035 return True
2036
2037 def _VerifyNodeTime(self, ninfo, nresult,
2038 nvinfo_starttime, nvinfo_endtime):
2039 """Check the node time.
2040
2041 @type ninfo: L{objects.Node}
2042 @param ninfo: the node to check
2043 @param nresult: the remote results for the node
2044 @param nvinfo_starttime: the start time of the RPC call
2045 @param nvinfo_endtime: the end time of the RPC call
2046
2047 """
2048 ntime = nresult.get(constants.NV_TIME, None)
2049 try:
2050 ntime_merged = utils.MergeTime(ntime)
2051 except (ValueError, TypeError):
2052 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
2053 "Node returned invalid time")
2054 return
2055
2056 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
2057 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
2058 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
2059 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
2060 else:
2061 ntime_diff = None
2062
2063 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name,
2064 "Node time diverges by at least %s from master node time",
2065 ntime_diff)
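# Example (illustrative): if the allowed clock skew is, say, 150 seconds, a
# node whose merged time lies more than 150s before the RPC start or more
# than 150s after the RPC end is flagged, and the divergence is reported
# rounded to a tenth of a second.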
2066
2068 """Check the node LVM results and update info for cross-node checks.
2069
2070 @type ninfo: L{objects.Node}
2071 @param ninfo: the node to check
2072 @param nresult: the remote results for the node
2073 @param vg_name: the configured VG name
2074 @type nimg: L{NodeImage}
2075 @param nimg: node image
2076
2077 """
2078 if vg_name is None:
2079 return
2080
2081
2082 vglist = nresult.get(constants.NV_VGLIST, None)
2083 test = not vglist
2084 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
2085 "unable to check volume groups")
2086 if not test:
2087 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
2088 constants.MIN_VG_SIZE)
2089 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)
2090
2091
2092 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
2093 for em in errmsgs:
2094 self._Error(constants.CV_ENODELVM, ninfo.name, em)
2095 if pvminmax is not None:
2096 (nimg.pv_min, nimg.pv_max) = pvminmax
2097
2099 """Check cross-node DRBD version consistency.
2100
2101 @type node_verify_infos: dict
2102 @param node_verify_infos: infos about nodes as returned from the
2103 node_verify call.
2104
2105 """
2106 node_versions = {}
2107 for node_uuid, ndata in node_verify_infos.items():
2108 nresult = ndata.payload
2109 if nresult:
2110 version = nresult.get(constants.NV_DRBDVERSION, None)
2111 if version:
2112 node_versions[node_uuid] = version
2113
2114 if len(set(node_versions.values())) > 1:
2115 for node_uuid, version in sorted(node_versions.items()):
2116 msg = "DRBD version mismatch: %s" % version
2117 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg,
2118 code=self.ETYPE_WARNING)
2119
2121 """Check cross-node consistency in LVM.
2122
2123 @type node_image: dict
2124 @param node_image: info about nodes, mapping from node UUIDs to
2125 L{NodeImage} objects
2126 @param vg_name: the configured VG name
2127
2128 """
2129 if vg_name is None:
2130 return
2131
2132
2133 if not self._exclusive_storage:
2134 return
2135
2136
2137
2138
2139 vals = filter((lambda ni: ni.pv_min is not None), node_image.values())
2140 if not vals:
2141 return
2142 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
2143 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
2144 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
2145 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
2146 "PV sizes differ too much in the group; smallest (%s MB) is"
2147 " on %s, biggest (%s MB) is on %s",
2148 pvmin, self.cfg.GetNodeName(minnode_uuid),
2149 pvmax, self.cfg.GetNodeName(maxnode_uuid))
2150
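# Illustrative sketch: how the group-wide PV size check above picks its
# extremes. The real code compares NodeImage.pv_min/pv_max with
# utils.LvmExclusiveTestBadPvSizes; here plain dicts and a hypothetical
# 10% tolerance stand in for both.
def _example_pv_spread(node_pv_sizes, allowed_ratio=0.1):
  """node_pv_sizes: dict of node name -> (pv_min_mb, pv_max_mb)."""
  vals = [(name, sizes) for (name, sizes) in node_pv_sizes.items()
          if sizes[0] is not None]
  if not vals:
    return None
  (minnode, (pvmin, _)) = min(vals, key=lambda item: item[1][0])
  (maxnode, (_, pvmax)) = max(vals, key=lambda item: item[1][1])
  if pvmax - pvmin > allowed_ratio * pvmin:
    return (pvmin, minnode, pvmax, maxnode)
  return None
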
2152 """Check the node bridges.
2153
2154 @type ninfo: L{objects.Node}
2155 @param ninfo: the node to check
2156 @param nresult: the remote results for the node
2157 @param bridges: the expected list of bridges
2158
2159 """
2160 if not bridges:
2161 return
2162
2163 missing = nresult.get(constants.NV_BRIDGES, None)
2164 test = not isinstance(missing, list)
2165 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
2166 "did not return valid bridge information")
2167 if not test:
2168 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
2169 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
2170
2188
2190 """Check the node network connectivity results.
2191
2192 @type ninfo: L{objects.Node}
2193 @param ninfo: the node to check
2194 @param nresult: the remote results for the node
2195
2196 """
2197 test = constants.NV_NODELIST not in nresult
2198 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name,
2199 "node hasn't returned node ssh connectivity data")
2200 if not test:
2201 if nresult[constants.NV_NODELIST]:
2202 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
2203 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
2204 "ssh communication with node '%s': %s", a_node, a_msg)
2205
2206 test = constants.NV_NODENETTEST not in nresult
2207 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
2208 "node hasn't returned node tcp connectivity data")
2209 if not test:
2210 if nresult[constants.NV_NODENETTEST]:
2211 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
2212 for anode in nlist:
2213 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name,
2214 "tcp communication with node '%s': %s",
2215 anode, nresult[constants.NV_NODENETTEST][anode])
2216
2217 test = constants.NV_MASTERIP not in nresult
2218 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
2219 "node hasn't returned node master IP reachability data")
2220 if not test:
2221 if not nresult[constants.NV_MASTERIP]:
2222 if ninfo.uuid == self.master_node:
2223 msg = "the master node cannot reach the master IP (not configured?)"
2224 else:
2225 msg = "cannot reach the master IP"
2226 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
2227
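# Illustrative sketch: condensing a node's connectivity payload into error
# strings, mirroring the three checks above. The dictionary keys are
# hypothetical stand-ins for constants.NV_NODELIST, NV_NODENETTEST and
# NV_MASTERIP.
def _example_net_errors(payload, is_master):
  errors = []
  for peer, msg in sorted(payload.get("ssh-nodelist", {}).items()):
    errors.append("ssh communication with node '%s': %s" % (peer, msg))
  for peer, msg in sorted(payload.get("tcp-nodenettest", {}).items()):
    errors.append("tcp communication with node '%s': %s" % (peer, msg))
  if not payload.get("master-ip", True):
    errors.append("the master node cannot reach the master IP"
                  if is_master else "cannot reach the master IP")
  return errors
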
2229 """Verify an instance.
2230
2231 This function checks that the required block devices are available
2232 on the instance's nodes, and that those nodes are in the correct
2233 state.
2234
2235 """
2236 pnode_uuid = instance.primary_node
2237 pnode_img = node_image[pnode_uuid]
2238 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2239
2240 node_vol_should = {}
2241 instance.MapLVsByNode(node_vol_should)
2242
2243 cluster = self.cfg.GetClusterInfo()
2244 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
2245 self.group_info)
2246 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
2247 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
2248 utils.CommaJoin(err), code=self.ETYPE_WARNING)
2249
2250 for node_uuid in node_vol_should:
2251 n_img = node_image[node_uuid]
2252 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2253
2254 continue
2255 for volume in node_vol_should[node_uuid]:
2256 test = volume not in n_img.volumes
2257 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
2258 "volume %s missing on node %s", volume,
2259 self.cfg.GetNodeName(node_uuid))
2260
2261 if instance.admin_state == constants.ADMINST_UP:
2262 test = instance.uuid not in pnode_img.instances and not pnode_img.offline
2263 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
2264 "instance not running on its primary node %s",
2265 self.cfg.GetNodeName(pnode_uuid))
2266 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
2267 instance.name, "instance is marked as running and lives on"
2268 " offline node %s", self.cfg.GetNodeName(pnode_uuid))
2269
2270 diskdata = [(nname, success, status, idx)
2271 for (nname, disks) in diskstatus.items()
2272 for idx, (success, status) in enumerate(disks)]
2273
2274 for nname, success, bdev_status, idx in diskdata:
2275
2276
2277 snode = node_image[nname]
2278 bad_snode = snode.ghost or snode.offline
2279 self._ErrorIf(instance.disks_active and
2280 not success and not bad_snode,
2281 constants.CV_EINSTANCEFAULTYDISK, instance.name,
2282 "couldn't retrieve status for disk/%s on %s: %s",
2283 idx, self.cfg.GetNodeName(nname), bdev_status)
2284
2285 if instance.disks_active and success and \
2286 (bdev_status.is_degraded or
2287 bdev_status.ldisk_status != constants.LDS_OKAY):
2288 msg = "disk/%s on %s" % (idx, self.cfg.GetNodeName(nname))
2289 if bdev_status.is_degraded:
2290 msg += " is degraded"
2291 if bdev_status.ldisk_status != constants.LDS_OKAY:
2292 msg += "; state is '%s'" % \
2293 constants.LDS_NAMES[bdev_status.ldisk_status]
2294
2295 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg)
2296
2297 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2298 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
2299 "instance %s, connection to primary node failed",
2300 instance.name)
2301
2302 self._ErrorIf(len(instance.secondary_nodes) > 1,
2303 constants.CV_EINSTANCELAYOUT, instance.name,
2304 "instance has multiple secondary nodes: %s",
2305 utils.CommaJoin(instance.secondary_nodes),
2306 code=self.ETYPE_WARNING)
2307
2308 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, instance.all_nodes)
2309 if any(es_flags.values()):
2310 if instance.disk_template not in constants.DTS_EXCL_STORAGE:
2311
2312
2313 es_nodes = [n
2314 for (n, es) in es_flags.items()
2315 if es]
2316 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
2317 "instance has template %s, which is not supported on nodes"
2318 " that have exclusive storage set: %s",
2319 instance.disk_template,
2320 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
2321 for (idx, disk) in enumerate(instance.disks):
2322 self._ErrorIf(disk.spindles is None,
2323 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
2324 "number of spindles not configured for disk %s while"
2325 " exclusive storage is enabled, try running"
2326 " gnt-cluster repair-disk-sizes", idx)
2327
2328 if instance.disk_template in constants.DTS_INT_MIRROR:
2329 instance_nodes = utils.NiceSort(instance.all_nodes)
2330 instance_groups = {}
2331
2332 for node_uuid in instance_nodes:
2333 instance_groups.setdefault(self.all_node_info[node_uuid].group,
2334 []).append(node_uuid)
2335
2336 pretty_list = [
2337 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
2338 groupinfo[group].name)
2339
2340 for group, nodes in sorted(instance_groups.items(),
2341 key=lambda (_, nodes): pnode_uuid in nodes,
2342 reverse=True)]
2343
2344 self._ErrorIf(len(instance_groups) > 1,
2345 constants.CV_EINSTANCESPLITGROUPS,
2346 instance.name, "instance has primary and secondary nodes in"
2347 " different groups: %s", utils.CommaJoin(pretty_list),
2348 code=self.ETYPE_WARNING)
2349
2350 inst_nodes_offline = []
2351 for snode in instance.secondary_nodes:
2352 s_img = node_image[snode]
2353 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2354 self.cfg.GetNodeName(snode),
2355 "instance %s, connection to secondary node failed",
2356 instance.name)
2357
2358 if s_img.offline:
2359 inst_nodes_offline.append(snode)
2360
2361
2362 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
2363 instance.name, "instance has offline secondary node(s) %s",
2364 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
2365
2366 for node_uuid in instance.all_nodes:
2367 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
2368 instance.name, "instance lives on ghost node %s",
2369 self.cfg.GetNodeName(node_uuid))
2370 self._ErrorIf(not node_image[node_uuid].vm_capable,
2371 constants.CV_EINSTANCEBADNODE, instance.name,
2372 "instance lives on non-vm_capable node %s",
2373 self.cfg.GetNodeName(node_uuid))
2374
2376 """Verify if there are any unknown volumes in the cluster.
2377
2378 The .os, .swap and backup volumes are ignored. All other volumes are
2379 reported as unknown.
2380
2381 @type reserved: L{ganeti.utils.FieldSet}
2382 @param reserved: a FieldSet of reserved volume names
2383
2384 """
2385 for node_uuid, n_img in node_image.items():
2386 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
2387 self.all_node_info[node_uuid].group != self.group_uuid):
2388
2389 continue
2390 for volume in n_img.volumes:
2391 test = ((node_uuid not in node_vol_should or
2392 volume not in node_vol_should[node_uuid]) and
2393 not reserved.Matches(volume))
2394 self._ErrorIf(test, constants.CV_ENODEORPHANLV,
2395 self.cfg.GetNodeName(node_uuid),
2396 "volume %s is unknown", volume,
2397 code=_VerifyErrors.ETYPE_WARNING)
2398
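# Illustrative sketch: the per-volume orphan test above, reduced to plain
# data. The real code uses a L{ganeti.utils.FieldSet} for the reserved
# names; anchored regular expressions stand in here.
import re

def _example_is_orphan(node, volume, node_vol_should, reserved_patterns):
  """True if the volume is neither expected on the node nor reserved."""
  expected = node_vol_should.get(node, [])
  reserved = any(re.match(pattern + "$", volume)
                 for pattern in reserved_patterns)
  return volume not in expected and not reserved
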
2400 """Verify N+1 Memory Resilience.
2401
2402 Check that if one single node dies we can still start all the
2403 instances it was primary for.
2404
2405 """
2406 cluster_info = self.cfg.GetClusterInfo()
2407 for node_uuid, n_img in node_image.items():
2408
2409
2410
2411
2412
2413
2414
2415
2416 if n_img.offline or \
2417 self.all_node_info[node_uuid].group != self.group_uuid:
2418
2419
2420
2421
2422 continue
2423
2424 for prinode, inst_uuids in n_img.sbp.items():
2425 needed_mem = 0
2426 for inst_uuid in inst_uuids:
2427 bep = cluster_info.FillBE(all_insts[inst_uuid])
2428 if bep[constants.BE_AUTO_BALANCE]:
2429 needed_mem += bep[constants.BE_MINMEM]
2430 test = n_img.mfree < needed_mem
2431 self._ErrorIf(test, constants.CV_ENODEN1,
2432 self.cfg.GetNodeName(node_uuid),
2433 "not enough memory to accomodate instance failovers"
2434 " should node %s fail (%dMiB needed, %dMiB available)",
2435 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
2436
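# Illustrative sketch: the N+1 memory check above on plain data. `sbp` maps
# a primary node name to the instances that use the checked node as
# secondary; the memory figures are made up.
def _example_n_plus_one(mfree_mb, sbp, inst_min_mem, auto_balance):
  """Return the primary nodes whose failover would not fit into mfree_mb."""
  overcommitted = []
  for prinode, inst_names in sbp.items():
    needed = sum(inst_min_mem[name] for name in inst_names
                 if auto_balance.get(name, True))
    if mfree_mb < needed:
      overcommitted.append((prinode, needed))
  return overcommitted

# _example_n_plus_one(4096, {"node1": ["a", "b"]},
#                     {"a": 2048, "b": 3072}, {}) == [("node1", 5120)]
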
2438 """Verifies the consistency of the client certificates.
2439
2440 This includes several aspects:
2441 - the individual validation of all nodes' certificates
2442 - the consistency of the master candidate certificate map
2443 - the consistency of the master candidate certificate map with the
2444 certificates that the master candidates are actually using.
2445
2446 @param nodes: the list of nodes to consider in this verification
2447 @param all_nvinfo: the map of results of the verify_node call to
2448 all nodes
2449
2450 """
2451 candidate_certs = self.cfg.GetClusterInfo().candidate_certs
2452 if candidate_certs is None or len(candidate_certs) == 0:
2453 self._ErrorIf(
2454 True, constants.CV_ECLUSTERCLIENTCERT, None,
2455 "The cluster's list of master candidate certificates is empty."
2456 " If you just updated the cluster, please run"
2457 " 'gnt-cluster renew-crypto --new-node-certificates'.")
2458 return
2459
2460 self._ErrorIf(
2461 len(candidate_certs) != len(set(candidate_certs.values())),
2462 constants.CV_ECLUSTERCLIENTCERT, None,
2463 "There are at least two master candidates configured to use the same"
2464 " certificate.")
2465
2466
2467 for node in nodes:
2468 if node.offline:
2469 continue
2470
2471 nresult = all_nvinfo[node.uuid]
2472 if nresult.fail_msg or not nresult.payload:
2473 continue
2474
2475 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)
2476
2477 self._ErrorIf(
2478 errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
2479 "Client certificate of node '%s' failed validation: %s (code '%s')",
2480 node.uuid, msg, errcode)
2481
2482 if not errcode:
2483 digest = msg
2484 if node.master_candidate:
2485 if node.uuid in candidate_certs:
2486 self._ErrorIf(
2487 digest != candidate_certs[node.uuid],
2488 constants.CV_ECLUSTERCLIENTCERT, None,
2489 "Client certificate digest of master candidate '%s' does not"
2490 " match its entry in the cluster's map of master candidate"
2491 " certificates. Expected: %s Got: %s", node.uuid,
2492 digest, candidate_certs[node.uuid])
2493 else:
2494 self._ErrorIf(
2495 True, constants.CV_ECLUSTERCLIENTCERT, None,
2496 "The master candidate '%s' does not have an entry in the"
2497 " map of candidate certificates.", node.uuid)
2498 self._ErrorIf(
2499 digest in candidate_certs.values(),
2500 constants.CV_ECLUSTERCLIENTCERT, None,
2501 "Master candidate '%s' is using a certificate of another node.",
2502 node.uuid)
2503 else:
2504 self._ErrorIf(
2505 node.uuid in candidate_certs,
2506 constants.CV_ECLUSTERCLIENTCERT, None,
2507 "Node '%s' is not a master candidate, but still listed in the"
2508 " map of master candidate certificates.", node.uuid)
2509 self._ErrorIf(
2510 (node.uuid not in candidate_certs) and
2511 (digest in candidate_certs.values()),
2512 constants.CV_ECLUSTERCLIENTCERT, None,
2513 "Node '%s' is not a master candidate and is incorrectly using a"
2514 " certificate of another node which is master candidate.",
2515 node.uuid)
2516
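# Illustrative sketch: the map-consistency rules applied above, on plain
# dictionaries. `candidate_certs` maps node UUID -> digest as stored in the
# cluster configuration, `node_digests` is what each node actually reported.
def _example_cert_mismatches(candidate_certs, node_digests, master_candidates):
  problems = []
  if len(set(candidate_certs.values())) != len(candidate_certs):
    problems.append("duplicate digest in the candidate map")
  for uuid, digest in sorted(node_digests.items()):
    if uuid in master_candidates:
      if candidate_certs.get(uuid) != digest:
        problems.append("candidate %s does not match its map entry" % uuid)
    elif uuid in candidate_certs:
      problems.append("%s is in the map but not a master candidate" % uuid)
  return problems
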
2517 - def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
2518 (files_all, files_opt, files_mc, files_vm)):
2519 """Verifies file checksums collected from all nodes.
2520
2521 @param nodes: List of L{objects.Node} objects
2522 @param master_node_uuid: UUID of master node
2523 @param all_nvinfo: RPC results
2524
2525 """
2526
2527 files2nodefn = [
2528 (files_all, None),
2529 (files_mc, lambda node: (node.master_candidate or
2530 node.uuid == master_node_uuid)),
2531 (files_vm, lambda node: node.vm_capable),
2532 ]
2533
2534
2535 nodefiles = {}
2536 for (files, fn) in files2nodefn:
2537 if fn is None:
2538 filenodes = nodes
2539 else:
2540 filenodes = filter(fn, nodes)
2541 nodefiles.update((filename,
2542 frozenset(map(operator.attrgetter("uuid"), filenodes)))
2543 for filename in files)
2544
2545 assert set(nodefiles) == (files_all | files_mc | files_vm)
2546
2547 fileinfo = dict((filename, {}) for filename in nodefiles)
2548 ignore_nodes = set()
2549
2550 for node in nodes:
2551 if node.offline:
2552 ignore_nodes.add(node.uuid)
2553 continue
2554
2555 nresult = all_nvinfo[node.uuid]
2556
2557 if nresult.fail_msg or not nresult.payload:
2558 node_files = None
2559 else:
2560 fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
2561 node_files = dict((vcluster.LocalizeVirtualPath(key), value)
2562 for (key, value) in fingerprints.items())
2563 del fingerprints
2564
2565 test = not (node_files and isinstance(node_files, dict))
2566 self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
2567 "Node did not return file checksum data")
2568 if test:
2569 ignore_nodes.add(node.uuid)
2570 continue
2571
2572
2573 for (filename, checksum) in node_files.items():
2574 assert filename in nodefiles
2575 fileinfo[filename].setdefault(checksum, set()).add(node.uuid)
2576
2577 for (filename, checksums) in fileinfo.items():
2578 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2579
2580
2581 with_file = frozenset(node_uuid
2582 for node_uuids in fileinfo[filename].values()
2583 for node_uuid in node_uuids) - ignore_nodes
2584
2585 expected_nodes = nodefiles[filename] - ignore_nodes
2586
2587
2588 missing_file = expected_nodes - with_file
2589
2590 if filename in files_opt:
2591
2592 self._ErrorIf(missing_file and missing_file != expected_nodes,
2593 constants.CV_ECLUSTERFILECHECK, None,
2594 "File %s is optional, but it must exist on all or no"
2595 " nodes (not found on %s)",
2596 filename,
2597 utils.CommaJoin(
2598 utils.NiceSort(
2599 map(self.cfg.GetNodeName, missing_file))))
2600 else:
2601 self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2602 "File %s is missing from node(s) %s", filename,
2603 utils.CommaJoin(
2604 utils.NiceSort(
2605 map(self.cfg.GetNodeName, missing_file))))
2606
2607
2608 unexpected = with_file - expected_nodes
2609 self._ErrorIf(unexpected,
2610 constants.CV_ECLUSTERFILECHECK, None,
2611 "File %s should not exist on node(s) %s",
2612 filename, utils.CommaJoin(
2613 utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))
2614
2615
2616 test = len(checksums) > 1
2617 if test:
2618 variants = ["variant %s on %s" %
2619 (idx + 1,
2620 utils.CommaJoin(utils.NiceSort(
2621 map(self.cfg.GetNodeName, node_uuids))))
2622 for (idx, (checksum, node_uuids)) in
2623 enumerate(sorted(checksums.items()))]
2624 else:
2625 variants = []
2626
2627 self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
2628 "File %s found with %s different checksums (%s)",
2629 filename, len(checksums), "; ".join(variants))
2630
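# Illustrative sketch: the core bookkeeping of the file check above, namely
# grouping nodes by the checksum they reported for a single file and
# flagging any disagreement. Inputs are plain dicts instead of RPC results.
def _example_file_variants(reported):
  """reported: dict of node name -> checksum for one file."""
  by_checksum = {}
  for node, checksum in reported.items():
    by_checksum.setdefault(checksum, set()).add(node)
  if len(by_checksum) <= 1:
    return []
  return ["variant %s on %s" % (idx + 1, ", ".join(sorted(nodes)))
          for idx, (_, nodes) in enumerate(sorted(by_checksum.items()))]
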
2632 """Verify the drbd helper.
2633
2634 """
2635 if drbd_helper:
2636 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2637 test = (helper_result is None)
2638 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
2639 "no drbd usermode helper returned")
2640 if helper_result:
2641 status, payload = helper_result
2642 test = not status
2643 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
2644 "drbd usermode helper check unsuccessful: %s", payload)
2645 test = status and (payload != drbd_helper)
2646 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
2647 "wrong drbd usermode helper: %s", payload)
2648
2649 - def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2650 drbd_map):
2651 """Verifies and the node DRBD status.
2652
2653 @type ninfo: L{objects.Node}
2654 @param ninfo: the node to check
2655 @param nresult: the remote results for the node
2656 @param instanceinfo: the dict of instances
2657 @param drbd_helper: the configured DRBD usermode helper
2658 @param drbd_map: the DRBD map as returned by
2659 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2660
2661 """
2662 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)
2663
2664
2665 node_drbd = {}
2666 for minor, inst_uuid in drbd_map[ninfo.uuid].items():
2667 test = inst_uuid not in instanceinfo
2668 self._ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2669 "ghost instance '%s' in temporary DRBD map", inst_uuid)
2670
2671
2672
2673 if test:
2674 node_drbd[minor] = (inst_uuid, False)
2675 else:
2676 instance = instanceinfo[inst_uuid]
2677 node_drbd[minor] = (inst_uuid, instance.disks_active)
2678
2679
2680 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2681 test = not isinstance(used_minors, (tuple, list))
2682 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
2683 "cannot parse drbd status file: %s", str(used_minors))
2684 if test:
2685
2686 return
2687
2688 for minor, (inst_uuid, must_exist) in node_drbd.items():
2689 test = minor not in used_minors and must_exist
2690 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
2691 "drbd minor %d of instance %s is not active", minor,
2692 self.cfg.GetInstanceName(inst_uuid))
2693 for minor in used_minors:
2694 test = minor not in node_drbd
2695 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
2696 "unallocated drbd minor %d is in use", minor)
2697
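# Illustrative sketch: reconciling configured DRBD minors with the minors a
# node reports as in use, as done above. `configured` maps minor ->
# (instance name, must_be_active); `used` is the list read from the node.
def _example_drbd_minor_check(configured, used):
  problems = []
  used_set = set(used)
  for minor, (inst, must_exist) in sorted(configured.items()):
    if must_exist and minor not in used_set:
      problems.append("minor %d of instance %s is not active" % (minor, inst))
  for minor in sorted(used_set - set(configured)):
    problems.append("unallocated minor %d is in use" % minor)
  return problems
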
2699 """Builds the node OS structures.
2700
2701 @type ninfo: L{objects.Node}
2702 @param ninfo: the node to check
2703 @param nresult: the remote results for the node
2704 @param nimg: the node image object
2705
2706 """
2707 remote_os = nresult.get(constants.NV_OSLIST, None)
2708 test = (not isinstance(remote_os, list) or
2709 not compat.all(isinstance(v, list) and len(v) == 7
2710 for v in remote_os))
2711
2712 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
2713 "node hasn't returned valid OS data")
2714
2715 nimg.os_fail = test
2716
2717 if test:
2718 return
2719
2720 os_dict = {}
2721
2722 for (name, os_path, status, diagnose,
2723 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2724
2725 if name not in os_dict:
2726 os_dict[name] = []
2727
2728
2729
2730 parameters = [tuple(v) for v in parameters]
2731 os_dict[name].append((os_path, status, diagnose,
2732 set(variants), set(parameters), set(api_ver)))
2733
2734 nimg.oslist = os_dict
2735
2737 """Verifies the node OS list.
2738
2739 @type ninfo: L{objects.Node}
2740 @param ninfo: the node to check
2741 @param nimg: the node image object
2742 @param base: the 'template' node we match against (e.g. from the master)
2743
2744 """
2745 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2746
2747 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2748 for os_name, os_data in nimg.oslist.items():
2749 assert os_data, "Empty OS status for OS %s?!" % os_name
2750 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2751 self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
2752 "Invalid OS %s (located at %s): %s",
2753 os_name, f_path, f_diag)
2754 self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
2755 "OS '%s' has multiple entries"
2756 " (first one shadows the rest): %s",
2757 os_name, utils.CommaJoin([v[0] for v in os_data]))
2758
2759 test = os_name not in base.oslist
2760 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
2761 "Extra OS %s not present on reference node (%s)",
2762 os_name, self.cfg.GetNodeName(base.uuid))
2763 if test:
2764 continue
2765 assert base.oslist[os_name], "Base node has empty OS status?"
2766 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2767 if not b_status:
2768
2769 continue
2770 for kind, a, b in [("API version", f_api, b_api),
2771 ("variants list", f_var, b_var),
2772 ("parameters", beautify_params(f_param),
2773 beautify_params(b_param))]:
2774 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
2775 "OS %s for %s differs from reference node %s:"
2776 " [%s] vs. [%s]", kind, os_name,
2777 self.cfg.GetNodeName(base.uuid),
2778 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2779
2780
2781 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2782 self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
2783 "OSes present on reference node %s"
2784 " but missing on this node: %s",
2785 self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
2786
2788 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.
2789
2790 @type ninfo: L{objects.Node}
2791 @param ninfo: the node to check
2792 @param nresult: the remote results for the node
2793 @type is_master: bool
2794 @param is_master: Whether node is the master node
2795
2796 """
2797 cluster = self.cfg.GetClusterInfo()
2798 if (is_master and
2799 (cluster.IsFileStorageEnabled() or
2800 cluster.IsSharedFileStorageEnabled())):
2801 try:
2802 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
2803 except KeyError:
2804
2805 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
2806 "Node did not return forbidden file storage paths")
2807 else:
2808 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
2809 "Found forbidden file storage paths: %s",
2810 utils.CommaJoin(fspaths))
2811 else:
2812 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
2813 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
2814 "Node should not have returned forbidden file storage"
2815 " paths")
2816
2817 - def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
2818 verify_key, error_key):
2819 """Verifies (file) storage paths.
2820
2821 @type ninfo: L{objects.Node}
2822 @param ninfo: the node to check
2823 @param nresult: the remote results for the node
2824 @type file_disk_template: string
2825 @param file_disk_template: file-based disk template, whose directory
2826 is supposed to be verified
2827 @type verify_key: string
2828 @param verify_key: key for the verification map of this file
2829 verification step
2830 @param error_key: error key to be added to the verification results
2831 in case something goes wrong in this verification step
2832
2833 """
2834 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
2835 constants.ST_FILE, constants.ST_SHARED_FILE
2836 ))
2837
2838 cluster = self.cfg.GetClusterInfo()
2839 if cluster.IsDiskTemplateEnabled(file_disk_template):
2840 self._ErrorIf(
2841 verify_key in nresult,
2842 error_key, ninfo.name,
2843 "The configured %s storage path is unusable: %s" %
2844 (file_disk_template, nresult.get(verify_key)))
2845
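# Illustrative sketch (an assumption, not the original code): the generic
# check above is parameterised per disk template, e.g. for plain file
# storage roughly as follows.
#
#   self._VerifyStoragePaths(ninfo, nresult, constants.DT_FILE,
#                            constants.NV_FILE_STORAGE_PATH,
#                            constants.CV_ENODEFILESTORAGEPATHS)
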
2856
2867
2869 """Verifies out of band functionality of a node.
2870
2871 @type ninfo: L{objects.Node}
2872 @param ninfo: the node to check
2873 @param nresult: the remote results for the node
2874
2875 """
2876
2877
2878 if ((ninfo.master_candidate or ninfo.master_capable) and
2879 constants.NV_OOB_PATHS in nresult):
2880 for path_result in nresult[constants.NV_OOB_PATHS]:
2881 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
2882 ninfo.name, path_result)
2883
2885 """Verifies and updates the node volume data.
2886
2887 This function will update a L{NodeImage}'s internal structures
2888 with data from the remote call.
2889
2890 @type ninfo: L{objects.Node}
2891 @param ninfo: the node to check
2892 @param nresult: the remote results for the node
2893 @param nimg: the node image object
2894 @param vg_name: the configured VG name
2895
2896 """
2897 nimg.lvm_fail = True
2898 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2899 if vg_name is None:
2900 pass
2901 elif isinstance(lvdata, basestring):
2902 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
2903 "LVM problem on node: %s", utils.SafeEncode(lvdata))
2904 elif not isinstance(lvdata, dict):
2905 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
2906 "rpc call to node failed (lvlist)")
2907 else:
2908 nimg.volumes = lvdata
2909 nimg.lvm_fail = False
2910
2912 """Verifies and updates the node instance list.
2913
2914 If the listing was successful, then updates this node's instance
2915 list. Otherwise, it marks the RPC call as failed for the instance
2916 list key.
2917
2918 @type ninfo: L{objects.Node}
2919 @param ninfo: the node to check
2920 @param nresult: the remote results for the node
2921 @param nimg: the node image object
2922
2923 """
2924 idata = nresult.get(constants.NV_INSTANCELIST, None)
2925 test = not isinstance(idata, list)
2926 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2927 "rpc call to node failed (instancelist): %s",
2928 utils.SafeEncode(str(idata)))
2929 if test:
2930 nimg.hyp_fail = True
2931 else:
2932 nimg.instances = [uuid for (uuid, _) in
2933 self.cfg.GetMultiInstanceInfoByName(idata)]
2934
2936 """Verifies and computes a node information map
2937
2938 @type ninfo: L{objects.Node}
2939 @param ninfo: the node to check
2940 @param nresult: the remote results for the node
2941 @param nimg: the node image object
2942 @param vg_name: the configured VG name
2943
2944 """
2945
2946 hv_info = nresult.get(constants.NV_HVINFO, None)
2947 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2948 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2949 "rpc call to node failed (hvinfo)")
2950 if not test:
2951 try:
2952 nimg.mfree = int(hv_info["memory_free"])
2953 except (ValueError, TypeError):
2954 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
2955 "node returned invalid nodeinfo, check hypervisor")
2956
2957
2958 if vg_name is not None:
2959 test = (constants.NV_VGLIST not in nresult or
2960 vg_name not in nresult[constants.NV_VGLIST])
2961 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
2962 "node didn't return data for the volume group '%s'"
2963 " - it is either missing or broken", vg_name)
2964 if not test:
2965 try:
2966 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2967 except (ValueError, TypeError):
2968 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
2969 "node returned invalid LVM info, check LVM status")
2970
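# Illustrative sketch: the defensive integer parsing used above when reading
# memory and volume-group figures out of an RPC payload.
def _example_parse_free(payload, key):
  """Return int(payload[key]), or None if the value is missing or invalid."""
  try:
    return int(payload[key])
  except (KeyError, ValueError, TypeError):
    return None
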
2972 """Gets per-disk status information for all instances.
2973
2974 @type node_uuids: list of strings
2975 @param node_uuids: Node UUIDs
2976 @type node_image: dict of (UUID, L{NodeImage})
2977 @param node_image: node images
2978 @type instanceinfo: dict of (UUID, L{objects.Instance})
2979 @param instanceinfo: Instance objects
2980 @rtype: {instance: {node: [(success, payload)]}}
2981 @return: a dictionary of per-instance dictionaries with nodes as
2982 keys and disk information as values; the disk information is a
2983 list of tuples (success, payload)
2984
2985 """
2986 node_disks = {}
2987 node_disks_dev_inst_only = {}
2988 diskless_instances = set()
2989 nodisk_instances = set()
2990 diskless = constants.DT_DISKLESS
2991
2992 for nuuid in node_uuids:
2993 node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
2994 node_image[nuuid].sinst))
2995 diskless_instances.update(uuid for uuid in node_inst_uuids
2996 if instanceinfo[uuid].disk_template == diskless)
2997 disks = [(inst_uuid, disk)
2998 for inst_uuid in node_inst_uuids
2999 for disk in instanceinfo[inst_uuid].disks]
3000
3001 if not disks:
3002 nodisk_instances.update(uuid for uuid in node_inst_uuids
3003 if instanceinfo[uuid].disk_template != diskless)
3004
3005 continue
3006
3007 node_disks[nuuid] = disks
3008
3009
3010 dev_inst_only = []
3011 for (inst_uuid, dev) in disks:
3012 (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
3013 self.cfg)
3014 dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))
3015
3016 node_disks_dev_inst_only[nuuid] = dev_inst_only
3017
3018 assert len(node_disks) == len(node_disks_dev_inst_only)
3019
3020
3021 result = self.rpc.call_blockdev_getmirrorstatus_multi(
3022 node_disks.keys(), node_disks_dev_inst_only)
3023
3024 assert len(result) == len(node_disks)
3025
3026 instdisk = {}
3027
3028 for (nuuid, nres) in result.items():
3029 node = self.cfg.GetNodeInfo(nuuid)
3030 disks = node_disks[node.uuid]
3031
3032 if nres.offline:
3033
3034 data = len(disks) * [(False, "node offline")]
3035 else:
3036 msg = nres.fail_msg
3037 self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
3038 "while getting disk information: %s", msg)
3039 if msg:
3040
3041 data = len(disks) * [(False, msg)]
3042 else:
3043 data = []
3044 for idx, i in enumerate(nres.payload):
3045 if isinstance(i, (tuple, list)) and len(i) == 2:
3046 data.append(i)
3047 else:
3048 logging.warning("Invalid result from node %s, entry %d: %s",
3049 node.name, idx, i)
3050 data.append((False, "Invalid result from the remote node"))
3051
3052 for ((inst_uuid, _), status) in zip(disks, data):
3053 instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
3054 .append(status)
3055
3056
3057 for inst_uuid in diskless_instances:
3058 assert inst_uuid not in instdisk
3059 instdisk[inst_uuid] = {}
3060
3061 for inst_uuid in nodisk_instances:
3062 assert inst_uuid not in instdisk
3063 instdisk[inst_uuid] = {}
3064
3065 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
3066 len(nuuids) <= len(instanceinfo[inst].all_nodes) and
3067 compat.all(isinstance(s, (tuple, list)) and
3068 len(s) == 2 for s in statuses)
3069 for inst, nuuids in instdisk.items()
3070 for nuuid, statuses in nuuids.items())
3071 if __debug__:
3072 instdisk_keys = set(instdisk)
3073 instanceinfo_keys = set(instanceinfo)
3074 assert instdisk_keys == instanceinfo_keys, \
3075 ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
3076 (instdisk_keys, instanceinfo_keys))
3077
3078 return instdisk
3079
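# Illustrative sketch: the shape of the mapping returned above; instance and
# node identifiers are made up.
_example_instdisk = {
  "inst-uuid-1": {
    "node-uuid-a": [(True, "bdev status disk/0"), (True, "bdev status disk/1")],
    "node-uuid-b": [(False, "node offline"), (False, "node offline")],
  },
  "inst-uuid-2": {},  # diskless instance: empty per-node map
}
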
3080 @staticmethod
3082 """Create endless iterators for all potential SSH check hosts.
3083
3084 """
3085 nodes = [node for node in all_nodes
3086 if (node.group != group_uuid and
3087 not node.offline)]
3088 keyfunc = operator.attrgetter("group")
3089
3090 return map(itertools.cycle,
3091 [sorted(map(operator.attrgetter("name"), names))
3092 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
3093 keyfunc)])
3094
3095 @classmethod
3097 """Choose which nodes should talk to which other nodes.
3098
3099 We will make nodes contact all nodes in their group, and one node from
3100 every other group.
3101
3102 @warning: This algorithm has a known issue if one node group is much
3103 smaller than others (e.g. just one node). In such a case all other
3104 nodes will talk to the single node.
3105
3106 """
3107 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
3108 sel = cls._SshNodeSelector(group_uuid, all_nodes)
3109
3110 return (online_nodes,
3111 dict((name, sorted([i.next() for i in sel]))
3112 for name in online_nodes))
3113
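# Illustrative sketch: the selection strategy documented above, on plain
# data. Every online node of the verified group gets one peer from each
# other group, cycling through those groups' members; names are made up.
import itertools

def _example_select_ssh_checks(group_nodes, other_groups):
  """other_groups: dict of group name -> list of online node names."""
  cyclers = [itertools.cycle(sorted(names))
             for names in other_groups.values() if names]
  return dict((name, sorted(next(c) for c in cyclers))
              for name in sorted(group_nodes))
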
3115 """Build hooks env.
3116
3117 Cluster-Verify hooks are run in the post phase only; a hook failure is
3118 logged in the verify output and makes the verification fail.
3119
3120 """
3121 env = {
3122 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
3123 }
3124
3125 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
3126 for node in self.my_node_info.values())
3127
3128 return env
3129
3131 """Build hooks nodes.
3132
3133 """
3134 return ([], list(self.my_node_info.keys()))
3135
3136 - def Exec(self, feedback_fn):
3137 """Verify integrity of the node group, performing various test on nodes.
3138
3139 """
3140
3141 feedback_fn("* Verifying group '%s'" % self.group_info.name)
3142
3143 if not self.my_node_uuids:
3144
3145 feedback_fn("* Empty node group, skipping verification")
3146 return True
3147
3148 self.bad = False
3149 verbose = self.op.verbose
3150 self._feedback_fn = feedback_fn
3151
3152 vg_name = self.cfg.GetVGName()
3153 drbd_helper = self.cfg.GetDRBDHelper()
3154 cluster = self.cfg.GetClusterInfo()
3155 hypervisors = cluster.enabled_hypervisors
3156 node_data_list = self.my_node_info.values()
3157
3158 i_non_redundant = []
3159 i_non_a_balanced = []
3160 i_offline = 0
3161 n_offline = 0
3162 n_drained = 0
3163 node_vol_should = {}
3164
3165
3166
3167
3168 filemap = ComputeAncillaryFiles(cluster, False)
3169
3170
3171 master_node_uuid = self.master_node = self.cfg.GetMasterNode()
3172 master_ip = self.cfg.GetMasterIP()
3173
3174 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))
3175
3176 user_scripts = []
3177 if self.cfg.GetUseExternalMipScript():
3178 user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)
3179
3180 node_verify_param = {
3181 constants.NV_FILELIST:
3182 map(vcluster.MakeVirtualPath,
3183 utils.UniqueSequence(filename
3184 for files in filemap
3185 for filename in files)),
3186 constants.NV_NODELIST:
3187 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
3188 self.all_node_info.values()),
3189 constants.NV_HYPERVISOR: hypervisors,
3190 constants.NV_HVPARAMS:
3191 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
3192 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
3193 for node in node_data_list
3194 if not node.offline],
3195 constants.NV_INSTANCELIST: hypervisors,
3196 constants.NV_VERSION: None,
3197 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
3198 constants.NV_NODESETUP: None,
3199 constants.NV_TIME: None,
3200 constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
3201 constants.NV_OSLIST: None,
3202 constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
3203 constants.NV_USERSCRIPTS: user_scripts,
3204 constants.NV_CLIENT_CERT: None,
3205 }
3206
3207 if vg_name is not None:
3208 node_verify_param[constants.NV_VGLIST] = None
3209 node_verify_param[constants.NV_LVLIST] = vg_name
3210 node_verify_param[constants.NV_PVLIST] = [vg_name]
3211
3212 if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
3213 if drbd_helper:
3214 node_verify_param[constants.NV_DRBDVERSION] = None
3215 node_verify_param[constants.NV_DRBDLIST] = None
3216 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
3217
3218 if cluster.IsFileStorageEnabled() or \
3219 cluster.IsSharedFileStorageEnabled():
3220
3221 node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
3222 self.cfg.GetMasterNodeName()
3223 if cluster.IsFileStorageEnabled():
3224 node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
3225 cluster.file_storage_dir
3226 if cluster.IsSharedFileStorageEnabled():
3227 node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
3228 cluster.shared_file_storage_dir
3229
3230
3231
3232 bridges = set()
3233 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
3234 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
3235 bridges.add(default_nicpp[constants.NIC_LINK])
3236 for instance in self.my_inst_info.values():
3237 for nic in instance.nics:
3238 full_nic = cluster.SimpleFillNIC(nic.nicparams)
3239 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
3240 bridges.add(full_nic[constants.NIC_LINK])
3241
3242 if bridges:
3243 node_verify_param[constants.NV_BRIDGES] = list(bridges)
3244
3245
3246 node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
3247 uuid=node.uuid,
3248 vm_capable=node.vm_capable))
3249 for node in node_data_list)
3250
3251
3252 oob_paths = []
3253 for node in self.all_node_info.values():
3254 path = SupportsOob(self.cfg, node)
3255 if path and path not in oob_paths:
3256 oob_paths.append(path)
3257
3258 if oob_paths:
3259 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
3260
3261 for inst_uuid in self.my_inst_uuids:
3262 instance = self.my_inst_info[inst_uuid]
3263 if instance.admin_state == constants.ADMINST_OFFLINE:
3264 i_offline += 1
3265
3266 for nuuid in instance.all_nodes:
3267 if nuuid not in node_image:
3268 gnode = self.NodeImage(uuid=nuuid)
3269 gnode.ghost = (nuuid not in self.all_node_info)
3270 node_image[nuuid] = gnode
3271
3272 instance.MapLVsByNode(node_vol_should)
3273
3274 pnode = instance.primary_node
3275 node_image[pnode].pinst.append(instance.uuid)
3276
3277 for snode in instance.secondary_nodes:
3278 nimg = node_image[snode]
3279 nimg.sinst.append(instance.uuid)
3280 if pnode not in nimg.sbp:
3281 nimg.sbp[pnode] = []
3282 nimg.sbp[pnode].append(instance.uuid)
3283
3284 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
3285 self.my_node_info.keys())
3286
3287
3288 self._exclusive_storage = compat.any(es_flags.values())
3289 if self._exclusive_storage:
3290 node_verify_param[constants.NV_EXCLUSIVEPVS] = True
3291
3292 node_group_uuids = dict(map(lambda n: (n.name, n.group),
3293 self.cfg.GetAllNodesInfo().values()))
3294 groups_config = self.cfg.GetAllNodeGroupsInfoDict()
3295
3296
3297
3298
3299
3300
3301
3302
3303 nvinfo_starttime = time.time()
3304 all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
3305 node_verify_param,
3306 self.cfg.GetClusterName(),
3307 self.cfg.GetClusterInfo().hvparams,
3308 node_group_uuids,
3309 groups_config)
3310 nvinfo_endtime = time.time()
3311
3312 if self.extra_lv_nodes and vg_name is not None:
3313 extra_lv_nvinfo = \
3314 self.rpc.call_node_verify(self.extra_lv_nodes,
3315 {constants.NV_LVLIST: vg_name},
3316 self.cfg.GetClusterName(),
3317 self.cfg.GetClusterInfo().hvparams,
3318 node_group_uuids,
3319 groups_config)
3320 else:
3321 extra_lv_nvinfo = {}
3322
3323 all_drbd_map = self.cfg.ComputeDRBDMap()
3324
3325 feedback_fn("* Gathering disk information (%s nodes)" %
3326 len(self.my_node_uuids))
3327 instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
3328 self.my_inst_info)
3329
3330 feedback_fn("* Verifying configuration file consistency")
3331
3332 self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
3333
3334
3335 absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
3336 if absent_node_uuids:
3337 vf_nvinfo = all_nvinfo.copy()
3338 vf_node_info = list(self.my_node_info.values())
3339 additional_node_uuids = []
3340 if master_node_uuid not in self.my_node_info:
3341 additional_node_uuids.append(master_node_uuid)
3342 vf_node_info.append(self.all_node_info[master_node_uuid])
3343
3344
3345 for node_uuid in absent_node_uuids:
3346 nodeinfo = self.all_node_info[node_uuid]
3347 if (nodeinfo.vm_capable and not nodeinfo.offline and
3348 node_uuid != master_node_uuid):
3349 additional_node_uuids.append(node_uuid)
3350 vf_node_info.append(self.all_node_info[node_uuid])
3351 break
3352 key = constants.NV_FILELIST
3353 vf_nvinfo.update(self.rpc.call_node_verify(
3354 additional_node_uuids, {key: node_verify_param[key]},
3355 self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams,
3356 node_group_uuids,
3357 groups_config))
3358 else:
3359 vf_nvinfo = all_nvinfo
3360 vf_node_info = self.my_node_info.values()
3361
3362 self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)
3363
3364 feedback_fn("* Verifying node status")
3365
3366 refos_img = None
3367
3368 for node_i in node_data_list:
3369 nimg = node_image[node_i.uuid]
3370
3371 if node_i.offline:
3372 if verbose:
3373 feedback_fn("* Skipping offline node %s" % (node_i.name,))
3374 n_offline += 1
3375 continue
3376
3377 if node_i.uuid == master_node_uuid:
3378 ntype = "master"
3379 elif node_i.master_candidate:
3380 ntype = "master candidate"
3381 elif node_i.drained:
3382 ntype = "drained"
3383 n_drained += 1
3384 else:
3385 ntype = "regular"
3386 if verbose:
3387 feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))
3388
3389 msg = all_nvinfo[node_i.uuid].fail_msg
3390 self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
3391 "while contacting node: %s", msg)
3392 if msg:
3393 nimg.rpc_fail = True
3394 continue
3395
3396 nresult = all_nvinfo[node_i.uuid].payload
3397
3398 nimg.call_ok = self._VerifyNode(node_i, nresult)
3399 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
3400 self._VerifyNodeNetwork(node_i, nresult)
3401 self._VerifyNodeUserScripts(node_i, nresult)
3402 self._VerifyOob(node_i, nresult)
3403 self._VerifyAcceptedFileStoragePaths(node_i, nresult,
3404 node_i.uuid == master_node_uuid)
3405 self._VerifyFileStoragePaths(node_i, nresult)
3406 self._VerifySharedFileStoragePaths(node_i, nresult)
3407
3408 if nimg.vm_capable:
3409 self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
3410 if constants.DT_DRBD8 in cluster.enabled_disk_templates:
3411 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
3412 all_drbd_map)
3413
3414 if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
3415 (constants.DT_DRBD8 in cluster.enabled_disk_templates):
3416 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
3417 self._UpdateNodeInstances(node_i, nresult, nimg)
3418 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
3419 self._UpdateNodeOS(node_i, nresult, nimg)
3420
3421 if not nimg.os_fail:
3422 if refos_img is None:
3423 refos_img = nimg
3424 self._VerifyNodeOS(node_i, nimg, refos_img)
3425 self._VerifyNodeBridges(node_i, nresult, bridges)
3426
3427
3428
3429
3430 non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)
3431
3432 for inst_uuid in non_primary_inst_uuids:
3433 test = inst_uuid in self.all_inst_info
3434 self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
3435 self.cfg.GetInstanceName(inst_uuid),
3436 "instance should not run on node %s", node_i.name)
3437 self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
3438 "node is running unknown instance %s", inst_uuid)
3439
3440 self._VerifyGroupDRBDVersion(all_nvinfo)
3441 self._VerifyGroupLVM(node_image, vg_name)
3442
3443 for node_uuid, result in extra_lv_nvinfo.items():
3444 self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
3445 node_image[node_uuid], vg_name)
3446
3447 feedback_fn("* Verifying instance status")
3448 for inst_uuid in self.my_inst_uuids:
3449 instance = self.my_inst_info[inst_uuid]
3450 if verbose:
3451 feedback_fn("* Verifying instance %s" % instance.name)
3452 self._VerifyInstance(instance, node_image, instdisk[inst_uuid])
3453
3454
3455
3456 if instance.disk_template not in constants.DTS_MIRRORED:
3457 i_non_redundant.append(instance)
3458
3459 if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
3460 i_non_a_balanced.append(instance)
3461
3462 feedback_fn("* Verifying orphan volumes")
3463 reserved = utils.FieldSet(*cluster.reserved_lvs)
3464
3465
3466
3467
3468 for instance in self.all_inst_info.values():
3469 for secondary in instance.secondary_nodes:
3470 if (secondary in self.my_node_info
3471 and instance.uuid not in self.my_inst_info):
3472 instance.MapLVsByNode(node_vol_should)
3473 break
3474
3475 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
3476
3477 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
3478 feedback_fn("* Verifying N+1 Memory redundancy")
3479 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
3480
3481 feedback_fn("* Other Notes")
3482 if i_non_redundant:
3483 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
3484 % len(i_non_redundant))
3485
3486 if i_non_a_balanced:
3487 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
3488 % len(i_non_a_balanced))
3489
3490 if i_offline:
3491 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
3492
3493 if n_offline:
3494 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
3495
3496 if n_drained:
3497 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
3498
3499 return not self.bad
3500
3501 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3502 """Analyze the post-hooks' result
3503
3504 This method analyses the hook result, handles it, and sends some
3505 nicely-formatted feedback back to the user.
3506
3507 @param phase: one of L{constants.HOOKS_PHASE_POST} or
3508 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3509 @param hooks_results: the results of the multi-node hooks rpc call
3510 @param feedback_fn: function used to send feedback back to the caller
3511 @param lu_result: previous Exec result
3512 @return: the new Exec result, based on the previous result
3513 and hook results
3514
3515 """
3516
3517
3518 if not self.my_node_uuids:
3519
3520 pass
3521 elif phase == constants.HOOKS_PHASE_POST:
3522
3523 feedback_fn("* Hooks Results")
3524 assert hooks_results, "invalid result from hooks"
3525
3526 for node_name in hooks_results:
3527 res = hooks_results[node_name]
3528 msg = res.fail_msg
3529 test = msg and not res.offline
3530 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3531 "Communication failure in hooks execution: %s", msg)
3532 if test:
3533 lu_result = False
3534 continue
3535 if res.offline:
3536
3537 continue
3538 for script, hkr, output in res.payload:
3539 test = hkr == constants.HKR_FAIL
3540 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3541 "Script %s failed, output:", script)
3542 if test:
3543 output = self._HOOKS_INDENT_RE.sub(" ", output)
3544 feedback_fn("%s" % output)
3545 lu_result = False
3546
3547 return lu_result
3548
3551 """Verifies the cluster disks status.
3552
3553 """
3554 REQ_BGL = False
3555
3561
3562 - def Exec(self, feedback_fn):
3568