22 """Module implementing the master-side code."""
23
24
25
26
27
28
29
30
31 import os
32 import os.path
33 import time
34 import re
35 import platform
36 import logging
37 import copy
38 import OpenSSL
39 import socket
40 import tempfile
41 import shutil
42
43 from ganeti import ssh
44 from ganeti import utils
45 from ganeti import errors
46 from ganeti import hypervisor
47 from ganeti import locking
48 from ganeti import constants
49 from ganeti import objects
50 from ganeti import serializer
51 from ganeti import ssconf
52 from ganeti import uidpool
53 from ganeti import compat
54 from ganeti import masterd
55 from ganeti import netutils
56 from ganeti import ht
57
58 import ganeti.masterd.instance
59
60
61
62
63 _POutputFields = ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString))
64
65
66
67 _PShutdownTimeout = ("shutdown_timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT,
68 ht.TPositiveInt)
69
70
71 _PForce = ("force", False, ht.TBool)
72
73
74 _PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString)
75
76
77 _PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool)
78
79
80 _PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString)
81
82
83 _PMigrationMode = ("mode", None,
84 ht.TOr(ht.TNone, ht.TElemOf(constants.HT_MIGRATION_MODES)))
85
86
87 _PMigrationLive = ("live", None, ht.TMaybeBool)
class LogicalUnit(object):
  """Logical Unit base class.
93
94 Subclasses must follow these rules:
95 - implement ExpandNames
96 - implement CheckPrereq (except when tasklets are used)
97 - implement Exec (except when tasklets are used)
98 - implement BuildHooksEnv
99 - redefine HPATH and HTYPE
100 - optionally redefine their run requirements:
101 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
102
103 Note that all commands require root permissions.
104
105 @ivar dry_run_result: the value (if any) that will be returned to the caller
106 in dry-run mode (signalled by opcode dry_run parameter)
@cvar _OP_PARAMS: a list of opcode attributes, the default values
    they should get if not already defined, and the types they must match
109
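For illustration only (a sketch, not taken from this module), a subclass
could reuse the parameter tuples defined above like::

  _OP_PARAMS = [
    _PInstanceName,
    _PForce,
    _PShutdownTimeout,
    ]
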
110 """
111 HPATH = None
112 HTYPE = None
113 _OP_PARAMS = []
114 REQ_BGL = True
115
def __init__(self, processor, op, context, rpc):
117 """Constructor for LogicalUnit.
118
119 This needs to be overridden in derived classes in order to check op
120 validity.
121
122 """
123 self.proc = processor
124 self.op = op
125 self.cfg = context.cfg
126 self.context = context
127 self.rpc = rpc
128
129 self.needed_locks = None
130 self.acquired_locks = {}
131 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
132 self.add_locks = {}
133 self.remove_locks = {}
134
135 self.recalculate_locks = {}
136 self.__ssh = None
137
138 self.Log = processor.Log
139 self.LogWarning = processor.LogWarning
140 self.LogInfo = processor.LogInfo
141 self.LogStep = processor.LogStep
142
143 self.dry_run_result = None
144
145 if (not hasattr(self.op, "debug_level") or
146 not isinstance(self.op.debug_level, int)):
147 self.op.debug_level = 0
148
149
150 self.tasklets = None
151
152
153 op_id = self.op.OP_ID
154 for attr_name, aval, test in self._OP_PARAMS:
155 if not hasattr(op, attr_name):
156 if aval == ht.NoDefault:
157 raise errors.OpPrereqError("Required parameter '%s.%s' missing" %
158 (op_id, attr_name), errors.ECODE_INVAL)
159 else:
160 if callable(aval):
161 dval = aval()
162 else:
163 dval = aval
164 setattr(self.op, attr_name, dval)
165 attr_val = getattr(op, attr_name)
166 if test == ht.NoType:
167
168 continue
169 if not callable(test):
170 raise errors.ProgrammerError("Validation for parameter '%s.%s' failed,"
171 " given type is not a proper type (%s)" %
172 (op_id, attr_name, test))
173 if not test(attr_val):
174 logging.error("OpCode %s, parameter %s, has invalid type %s/value %s",
175 self.op.OP_ID, attr_name, type(attr_val), attr_val)
176 raise errors.OpPrereqError("Parameter '%s.%s' fails validation" %
177 (op_id, attr_name), errors.ECODE_INVAL)
178
179 self.CheckArguments()
180
def __GetSSH(self):
  """Returns the SshRunner object
183
184 """
185 if not self.__ssh:
186 self.__ssh = ssh.SshRunner(self.cfg.GetClusterName())
187 return self.__ssh
188
189 ssh = property(fget=__GetSSH)
190
def CheckArguments(self):
  """Check syntactic validity for the opcode arguments.
193
194 This method is for doing a simple syntactic check and ensure
195 validity of opcode parameters, without any cluster-related
196 checks. While the same can be accomplished in ExpandNames and/or
197 CheckPrereq, doing these separate is better because:
198
- ExpandNames is left as purely a lock-related function
- CheckPrereq is run after we have acquired locks (and possibly
201 waited for them)
202
203 The function is allowed to change the self.op attribute so that
later methods need no longer worry about missing parameters.
205
206 """
207 pass
208
def ExpandNames(self):
  """Expand names for this LU.
211
212 This method is called before starting to execute the opcode, and it should
213 update all the parameters of the opcode to their canonical form (e.g. a
214 short node name must be fully expanded after this method has successfully
completed). This way locking, hooks, logging, etc. can work correctly.
216
217 LUs which implement this method must also populate the self.needed_locks
218 member, as a dict with lock levels as keys, and a list of needed lock names
219 as values. Rules:
220
221 - use an empty dict if you don't need any lock
222 - if you don't need any lock at a particular level omit that level
223 - don't put anything for the BGL level
224 - if you want all locks at a level use locking.ALL_SET as a value
225
226 If you need to share locks (rather than acquire them exclusively) at one
227 level you can modify self.share_locks, setting a true value (usually 1) for
228 that level. By default locks are not shared.
229
230 This function can also define a list of tasklets, which then will be
231 executed in order instead of the usual LU-level CheckPrereq and Exec
232 functions, if those are not defined by the LU.
233
234 Examples::
235
236 # Acquire all nodes and one instance
237 self.needed_locks = {
238 locking.LEVEL_NODE: locking.ALL_SET,
239 locking.LEVEL_INSTANCE: ['instance1.example.com'],
240 }
241 # Acquire just two nodes
242 self.needed_locks = {
243 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
244 }
245 # Acquire no locks
246 self.needed_locks = {} # No, you can't leave it to the default value None
247
248 """
249
250
251
252 if self.REQ_BGL:
253 self.needed_locks = {}
254 else:
255 raise NotImplementedError
256
def DeclareLocks(self, level):
  """Declare LU locking needs for a level
259
260 While most LUs can just declare their locking needs at ExpandNames time,
261 sometimes there's the need to calculate some locks after having acquired
262 the ones before. This function is called just before acquiring locks at a
263 particular level, but after acquiring the ones at lower levels, and permits
264 such calculations. It can be used to modify self.needed_locks, and by
265 default it does nothing.
266
267 This function is only called if you have something already set in
268 self.needed_locks for the level.
269
270 @param level: Locking level which is going to be locked
271 @type level: member of ganeti.locking.LEVELS
272
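A typical implementation for an instance-related LU (an illustrative
sketch, mirroring the pattern documented for _LockInstancesNodes below)
would be::

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()
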
273 """
274
def CheckPrereq(self):
  """Check prerequisites for this LU.
277
278 This method should check that the prerequisites for the execution
279 of this LU are fulfilled. It can do internode communication, but
280 it should be idempotent - no cluster or system changes are
281 allowed.
282
283 The method should raise errors.OpPrereqError in case something is
284 not fulfilled. Its return value is ignored.
285
286 This method should also update all the parameters of the opcode to
287 their canonical form if it hasn't been done by ExpandNames before.
288
289 """
290 if self.tasklets is not None:
291 for (idx, tl) in enumerate(self.tasklets):
292 logging.debug("Checking prerequisites for tasklet %s/%s",
293 idx + 1, len(self.tasklets))
294 tl.CheckPrereq()
295 else:
296 pass
297
def Exec(self, feedback_fn):
299 """Execute the LU.
300
301 This method should implement the actual work. It should raise
302 errors.OpExecError for failures that are somewhat dealt with in
303 code, or expected.
304
305 """
306 if self.tasklets is not None:
307 for (idx, tl) in enumerate(self.tasklets):
308 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
309 tl.Exec(feedback_fn)
310 else:
311 raise NotImplementedError
312
def BuildHooksEnv(self):
  """Build hooks environment for this LU.
315
This method should return a three-element tuple consisting of: a dict
317 containing the environment that will be used for running the
318 specific hook for this LU, a list of node names on which the hook
319 should run before the execution, and a list of node names on which
320 the hook should run after the execution.
321
The keys of the dict must not be prefixed with 'GANETI_', as this
prefix is handled by the hooks runner. Also note additional keys will be
324 added by the hooks runner. If the LU doesn't define any
325 environment, an empty dict (and not None) should be returned.
326
If there are no nodes to return, an empty list (and not None) should be
used.
328
329 Note that if the HPATH for a LU class is None, this function will
330 not be called.
331
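For illustration, a minimal return value (the same shape used by
LUPostInitCluster further below) would be::

  return env, [], [self.cfg.GetMasterNode()]
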
332 """
333 raise NotImplementedError
334
def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
336 """Notify the LU about the results of its hooks.
337
338 This method is called every time a hooks phase is executed, and notifies
339 the Logical Unit about the hooks' result. The LU can then use it to alter
340 its result based on the hooks. By default the method does nothing and the
previous result is passed back unchanged, but any LU can override it if it
wants to make use of the local cluster hook-scripts somehow.
343
344 @param phase: one of L{constants.HOOKS_PHASE_POST} or
345 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
346 @param hook_results: the results of the multi-node hooks rpc call
@param feedback_fn: function used to send feedback back to the caller
348 @param lu_result: the previous Exec result this LU had, or None
349 in the PRE phase
350 @return: the new Exec result, based on the previous result
351 and hook results
352
353 """
354
355
356
357 return lu_result
358
def _ExpandAndLockInstance(self):
  """Helper function to expand and lock an instance.
361
362 Many LUs that work on an instance take its name in self.op.instance_name
363 and need to expand it and then declare the expanded name for locking. This
364 function does it, and then updates self.op.instance_name to the expanded
365 name. It also initializes needed_locks as a dict, if this hasn't been done
366 before.
367
368 """
369 if self.needed_locks is None:
370 self.needed_locks = {}
371 else:
372 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
373 "_ExpandAndLockInstance called with instance-level locks set"
374 self.op.instance_name = _ExpandInstanceName(self.cfg,
375 self.op.instance_name)
376 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
377
def _LockInstancesNodes(self, primary_only=False):
  """Helper function to declare instances' nodes for locking.
380
381 This function should be called after locking one or more instances to lock
382 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
383 with all primary or secondary nodes for instances already locked and
384 present in self.needed_locks[locking.LEVEL_INSTANCE].
385
386 It should be called from DeclareLocks, and for safety only works if
387 self.recalculate_locks[locking.LEVEL_NODE] is set.
388
389 In the future it may grow parameters to just lock some instance's nodes, or
390 to just lock primaries or secondary nodes, if needed.
391
It should be called in DeclareLocks in a way similar to::
393
394 if level == locking.LEVEL_NODE:
395 self._LockInstancesNodes()
396
397 @type primary_only: boolean
398 @param primary_only: only lock primary nodes of locked instances
399
400 """
401 assert locking.LEVEL_NODE in self.recalculate_locks, \
402 "_LockInstancesNodes helper function called with no nodes to recalculate"
403
404
405
406
407
408
409 wanted_nodes = []
410 for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
411 instance = self.context.cfg.GetInstanceInfo(instance_name)
412 wanted_nodes.append(instance.primary_node)
413 if not primary_only:
414 wanted_nodes.extend(instance.secondary_nodes)
415
416 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
417 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
418 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
419 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
420
421 del self.recalculate_locks[locking.LEVEL_NODE]
422
class NoHooksLU(LogicalUnit):
  """Simple LU which runs no hooks.
426
427 This LU is intended as a parent for other LogicalUnits which will
428 run no hooks, in order to reduce duplicate code.
429
430 """
431 HPATH = None
432 HTYPE = None
433
def BuildHooksEnv(self):
  """Empty BuildHooksEnv for NoHooksLU.
436
437 This just raises an error.
438
439 """
440 assert False, "BuildHooksEnv called for NoHooksLUs"
441
class Tasklet:
  """Tasklet base class.
445
446 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
447 they can mix legacy code with tasklets. Locking needs to be done in the LU,
448 tasklets know nothing about locks.
449
450 Subclasses must follow these rules:
451 - Implement CheckPrereq
452 - Implement Exec
453
454 """
def __init__(self, lu):
  self.lu = lu
457
458
459 self.cfg = lu.cfg
460 self.rpc = lu.rpc
461
def CheckPrereq(self):
  """Check prerequisites for this tasklet.
464
465 This method should check whether the prerequisites for the execution of
466 this tasklet are fulfilled. It can do internode communication, but it
467 should be idempotent - no cluster or system changes are allowed.
468
469 The method should raise errors.OpPrereqError in case something is not
470 fulfilled. Its return value is ignored.
471
472 This method should also update all parameters to their canonical form if it
473 hasn't been done before.
474
475 """
476 pass
477
def Exec(self, feedback_fn):
479 """Execute the tasklet.
480
481 This method should implement the actual work. It should raise
482 errors.OpExecError for failures that are somewhat dealt with in code, or
483 expected.
484
485 """
486 raise NotImplementedError
487
def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.
491
492 @type lu: L{LogicalUnit}
493 @param lu: the logical unit on whose behalf we execute
494 @type nodes: list
495 @param nodes: list of node names or None for all nodes
496 @rtype: list
497 @return: the list of nodes, sorted
498 @raise errors.ProgrammerError: if the nodes parameter is wrong type
499
500 """
501 if not nodes:
502 raise errors.ProgrammerError("_GetWantedNodes should only be called with a"
503 " non-empty list of nodes whose name is to be expanded.")
504
505 wanted = [_ExpandNodeName(lu.cfg, name) for name in nodes]
506 return utils.NiceSort(wanted)
507
def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.
511
512 @type lu: L{LogicalUnit}
513 @param lu: the logical unit on whose behalf we execute
514 @type instances: list
515 @param instances: list of instance names or None for all instances
516 @rtype: list
517 @return: the list of instances, sorted
518 @raise errors.OpPrereqError: if the instances parameter is wrong type
519 @raise errors.OpPrereqError: if any of the passed instances is not found
520
521 """
522 if instances:
523 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
524 else:
525 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
526 return wanted
527
528
def _GetUpdatedParams(old_params, update_dict,
530 use_default=True, use_none=False):
531 """Return the new version of a parameter dictionary.
532
533 @type old_params: dict
534 @param old_params: old parameters
535 @type update_dict: dict
536 @param update_dict: dict containing new parameter values, or
537 constants.VALUE_DEFAULT to reset the parameter to its default
538 value
@type use_default: boolean
@param use_default: whether to recognise L{constants.VALUE_DEFAULT}
    values as 'to be deleted' values
@type use_none: boolean
@param use_none: whether to recognise C{None} values as 'to be
    deleted' values
545 @rtype: dict
546 @return: the new parameter dictionary
547
548 """
549 params_copy = copy.deepcopy(old_params)
550 for key, val in update_dict.iteritems():
551 if ((use_default and val == constants.VALUE_DEFAULT) or
552 (use_none and val is None)):
553 try:
554 del params_copy[key]
555 except KeyError:
556 pass
557 else:
558 params_copy[key] = val
559 return params_copy
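# Illustrative example (a sketch, not part of the original module) of how
# _GetUpdatedParams behaves: entries set to constants.VALUE_DEFAULT are
# dropped, everything else overrides the old value:
#
#   _GetUpdatedParams({"memory": 512, "vcpus": 2},
#                     {"memory": constants.VALUE_DEFAULT, "vcpus": 4})
#   => {"vcpus": 4}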
560
def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.
564
565 @type static: L{utils.FieldSet}
566 @param static: static fields set
567 @type dynamic: L{utils.FieldSet}
568 @param dynamic: dynamic fields set
569
570 """
571 f = utils.FieldSet()
572 f.Extend(static)
573 f.Extend(dynamic)
574
575 delta = f.NonMatching(selected)
576 if delta:
577 raise errors.OpPrereqError("Unknown output fields selected: %s"
578 % ",".join(delta), errors.ECODE_INVAL)
579
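# Illustrative usage of _CheckOutputFields above (a hedged sketch; the field
# names used here are made up for the example):
#
#   _CheckOutputFields(static=utils.FieldSet("name", "pinst_cnt"),
#                      dynamic=utils.FieldSet("oper_state"),
#                      selected=self.op.output_fields)
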
def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.
583
584 This will ensure that instances don't get customised versions of
585 global params.
586
587 """
588 used_globals = constants.HVC_GLOBALS.intersection(params)
589 if used_globals:
590 msg = ("The following hypervisor parameters are global and cannot"
591 " be customized at instance level, please modify them at"
592 " cluster level: %s" % utils.CommaJoin(used_globals))
593 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
594
def _CheckNodeOnline(lu, node, msg=None):
  """Ensure that a given node is online.
598
599 @param lu: the LU on behalf of which we make the check
600 @param node: the node to check
601 @param msg: if passed, should be a message to replace the default one
602 @raise errors.OpPrereqError: if the node is offline
603
604 """
605 if msg is None:
606 msg = "Can't use offline node"
607 if lu.cfg.GetNodeInfo(node).offline:
608 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
609
def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.
613
614 @param lu: the LU on behalf of which we make the check
615 @param node: the node to check
616 @raise errors.OpPrereqError: if the node is drained
617
618 """
619 if lu.cfg.GetNodeInfo(node).drained:
620 raise errors.OpPrereqError("Can't use drained node %s" % node,
621 errors.ECODE_STATE)
622
def _CheckNodeVmCapable(lu, node):
  """Ensure that a given node is vm capable.
626
627 @param lu: the LU on behalf of which we make the check
628 @param node: the node to check
629 @raise errors.OpPrereqError: if the node is not vm capable
630
631 """
632 if not lu.cfg.GetNodeInfo(node).vm_capable:
633 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
634 errors.ECODE_STATE)
635
def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.
639
640 @param lu: the LU on behalf of which we make the check
641 @param node: the node to check
642 @param os_name: the OS to query about
643 @param force_variant: whether to ignore variant errors
644 @raise errors.OpPrereqError: if the node is not supporting the OS
645
646 """
647 result = lu.rpc.call_os_get(node, os_name)
648 result.Raise("OS '%s' not in supported OS list for node %s" %
649 (os_name, node),
650 prereq=True, ecode=errors.ECODE_INVAL)
651 if not force_variant:
652 _CheckOSVariant(result.payload, os_name)
653
def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
  """Ensure that a node has the given secondary ip.
657
658 @type lu: L{LogicalUnit}
659 @param lu: the LU on behalf of which we make the check
660 @type node: string
661 @param node: the node to check
662 @type secondary_ip: string
663 @param secondary_ip: the ip to check
664 @type prereq: boolean
665 @param prereq: whether to throw a prerequisite or an execute error
666 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
667 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
668
669 """
670 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
671 result.Raise("Failure checking secondary ip on node %s" % node,
672 prereq=prereq, ecode=errors.ECODE_ENVIRON)
673 if not result.payload:
674 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
675 " please fix and re-run this command" % secondary_ip)
676 if prereq:
677 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
678 else:
679 raise errors.OpExecError(msg)
680
691
704
716
def _GetClusterDomainSecret():
  """Reads the cluster domain secret.
720
721 """
722 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
723 strict=True)
724
740
def _ExpandItemName(fn, name, kind):
  """Expand an item name.
744
745 @param fn: the function to use for expansion
746 @param name: requested item name
747 @param kind: text description ('Node' or 'Instance')
748 @return: the resolved (full) name
749 @raise errors.OpPrereqError: if the item is not found
750
751 """
752 full_name = fn(name)
753 if full_name is None:
754 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
755 errors.ECODE_NOENT)
756 return full_name
757
762
767
768
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
770 memory, vcpus, nics, disk_template, disks,
771 bep, hvp, hypervisor_name):
772 """Builds instance related env variables for hooks
773
774 This builds the hook environment from individual variables.
775
776 @type name: string
777 @param name: the name of the instance
778 @type primary_node: string
779 @param primary_node: the name of the instance's primary node
780 @type secondary_nodes: list
781 @param secondary_nodes: list of secondary nodes as strings
782 @type os_type: string
783 @param os_type: the name of the instance's OS
784 @type status: boolean
785 @param status: the should_run status of the instance
786 @type memory: string
787 @param memory: the memory size of the instance
788 @type vcpus: string
789 @param vcpus: the count of VCPUs the instance has
790 @type nics: list
791 @param nics: list of tuples (ip, mac, mode, link) representing
792 the NICs the instance has
793 @type disk_template: string
794 @param disk_template: the disk template of the instance
795 @type disks: list
796 @param disks: the list of (size, mode) pairs
797 @type bep: dict
798 @param bep: the backend parameters for the instance
799 @type hvp: dict
800 @param hvp: the hypervisor parameters for the instance
801 @type hypervisor_name: string
802 @param hypervisor_name: the hypervisor for the instance
803 @rtype: dict
804 @return: the hook environment for this instance
805
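For illustration, for a one-NIC, one-disk instance the result contains
(among others) keys like::

  INSTANCE_NAME, INSTANCE_PRIMARY, INSTANCE_NIC_COUNT, INSTANCE_NIC0_MAC,
  INSTANCE_DISK_COUNT, INSTANCE_DISK0_SIZE, INSTANCE_HYPERVISOR
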
806 """
807 if status:
808 str_status = "up"
809 else:
810 str_status = "down"
811 env = {
812 "OP_TARGET": name,
813 "INSTANCE_NAME": name,
814 "INSTANCE_PRIMARY": primary_node,
815 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
816 "INSTANCE_OS_TYPE": os_type,
817 "INSTANCE_STATUS": str_status,
818 "INSTANCE_MEMORY": memory,
819 "INSTANCE_VCPUS": vcpus,
820 "INSTANCE_DISK_TEMPLATE": disk_template,
821 "INSTANCE_HYPERVISOR": hypervisor_name,
822 }
823
824 if nics:
825 nic_count = len(nics)
826 for idx, (ip, mac, mode, link) in enumerate(nics):
827 if ip is None:
828 ip = ""
829 env["INSTANCE_NIC%d_IP" % idx] = ip
830 env["INSTANCE_NIC%d_MAC" % idx] = mac
831 env["INSTANCE_NIC%d_MODE" % idx] = mode
832 env["INSTANCE_NIC%d_LINK" % idx] = link
833 if mode == constants.NIC_MODE_BRIDGED:
834 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
835 else:
836 nic_count = 0
837
838 env["INSTANCE_NIC_COUNT"] = nic_count
839
840 if disks:
841 disk_count = len(disks)
842 for idx, (size, mode) in enumerate(disks):
843 env["INSTANCE_DISK%d_SIZE" % idx] = size
844 env["INSTANCE_DISK%d_MODE" % idx] = mode
845 else:
846 disk_count = 0
847
848 env["INSTANCE_DISK_COUNT"] = disk_count
849
850 for source, kind in [(bep, "BE"), (hvp, "HV")]:
851 for key, value in source.items():
852 env["INSTANCE_%s_%s" % (kind, key)] = value
853
854 return env
855
def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.
859
860 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
861 value in LUQueryInstanceData.
862
863 @type lu: L{LogicalUnit}
864 @param lu: the logical unit on whose behalf we execute
865 @type nics: list of L{objects.NIC}
866 @param nics: list of nics to convert to hooks tuples
867
868 """
869 hooks_nics = []
870 cluster = lu.cfg.GetClusterInfo()
871 for nic in nics:
872 ip = nic.ip
873 mac = nic.mac
874 filled_params = cluster.SimpleFillNIC(nic.nicparams)
875 mode = filled_params[constants.NIC_MODE]
876 link = filled_params[constants.NIC_LINK]
877 hooks_nics.append((ip, mac, mode, link))
878 return hooks_nics
879
def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.
883
884 @type lu: L{LogicalUnit}
885 @param lu: the logical unit on whose behalf we execute
886 @type instance: L{objects.Instance}
887 @param instance: the instance for which we should build the
888 environment
889 @type override: dict
890 @param override: dictionary with key/values that will override
891 our values
892 @rtype: dict
893 @return: the hook environment dictionary
894
895 """
896 cluster = lu.cfg.GetClusterInfo()
897 bep = cluster.FillBE(instance)
898 hvp = cluster.FillHV(instance)
899 args = {
900 'name': instance.name,
901 'primary_node': instance.primary_node,
902 'secondary_nodes': instance.secondary_nodes,
903 'os_type': instance.os,
904 'status': instance.admin_up,
905 'memory': bep[constants.BE_MEMORY],
906 'vcpus': bep[constants.BE_VCPUS],
907 'nics': _NICListToTuple(lu, instance.nics),
908 'disk_template': instance.disk_template,
909 'disks': [(disk.size, disk.mode) for disk in instance.disks],
910 'bep': bep,
911 'hvp': hvp,
912 'hypervisor_name': instance.hypervisor,
913 }
914 if override:
915 args.update(override)
916 return _BuildInstanceHookEnv(**args)
917
def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.
921
922 """
923 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
924 if mod_list:
925 lu.LogInfo("Promoted nodes to master candidate role: %s",
926 utils.CommaJoin(node.name for node in mod_list))
927 for name in mod_list:
928 lu.context.ReaddNode(name)
929 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
930 if mc_now > mc_max:
931 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
932 (mc_now, mc_max))
933
944
958
967
def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.
971
972 @type os_obj: L{objects.OS}
973 @param os_obj: OS object to check
974 @type name: string
975 @param name: OS name passed by the user, to check for validity
976
977 """
978 if not os_obj.supported_variants:
979 return
980 variant = objects.OS.GetVariant(name)
981 if not variant:
982 raise errors.OpPrereqError("OS name must include a variant",
983 errors.ECODE_INVAL)
984
985 if variant not in os_obj.supported_variants:
986 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
987
991
def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.
995
996 """
997
998 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
999
def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.
1003
1004 """
1005 return _GetNodeInstancesInner(cfg,
1006 lambda inst: node_name == inst.primary_node)
1007
1015
def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.
1019
1020 """
1021
1022 if storage_type == constants.ST_FILE:
1023
1024 return [[cfg.GetFileStorageDir()]]
1025
1026 return []
1027
def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  faulty = []
1031
1032 for dev in instance.disks:
1033 cfg.SetDiskID(dev, node_name)
1034
1035 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1036 result.Raise("Failed to get disk status from node %s" % node_name,
1037 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1038
1039 for idx, bdev_status in enumerate(result.payload):
1040 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1041 faulty.append(idx)
1042
1043 return faulty
1044
def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
1048 cluster-wide iallocator if appropriate.
1049
1050 Check that at most one of (iallocator, node) is specified. If none is
1051 specified, then the LU's opcode's iallocator slot is filled with the
1052 cluster-wide default iallocator.
1053
1054 @type iallocator_slot: string
1055 @param iallocator_slot: the name of the opcode iallocator slot
1056 @type node_slot: string
1057 @param node_slot: the name of the opcode target node slot
1058
1059 """
1060 node = getattr(lu.op, node_slot, None)
1061 iallocator = getattr(lu.op, iallocator_slot, None)
1062
1063 if node is not None and iallocator is not None:
1064 raise errors.OpPrereqError("Do not specify both, iallocator and node.",
1065 errors.ECODE_INVAL)
1066 elif node is None and iallocator is None:
1067 default_iallocator = lu.cfg.GetDefaultIAllocator()
1068 if default_iallocator:
1069 setattr(lu.op, iallocator_slot, default_iallocator)
1070 else:
1071 raise errors.OpPrereqError("No iallocator or node given and no"
1072 " cluster-wide default iallocator found."
1073 " Please specify either an iallocator or a"
1074 " node, or set a cluster-wide default"
1075 " iallocator.")
1076
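# Illustrative call from an LU's CheckArguments (a sketch; the slot names
# "iallocator" and "node" are the usual opcode attribute names):
#
#   _CheckIAllocatorOrNode(self, "iallocator", "node")
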
1077
class LUPostInitCluster(LogicalUnit):
1079 """Logical unit for running hooks after cluster initialization.
1080
1081 """
1082 HPATH = "cluster-init"
1083 HTYPE = constants.HTYPE_CLUSTER
1084
def BuildHooksEnv(self):
1086 """Build hooks env.
1087
1088 """
1089 env = {"OP_TARGET": self.cfg.GetClusterName()}
1090 mn = self.cfg.GetMasterNode()
1091 return env, [], [mn]
1092
def Exec(self, feedback_fn):
1094 """Nothing to do.
1095
1096 """
1097 return True
1098
class LUDestroyCluster(LogicalUnit):
  """Logical unit for destroying the cluster.
1102
1103 """
1104 HPATH = "cluster-destroy"
1105 HTYPE = constants.HTYPE_CLUSTER
1106
1108 """Build hooks env.
1109
1110 """
1111 env = {"OP_TARGET": self.cfg.GetClusterName()}
1112 return env, [], []
1113
def CheckPrereq(self):
  """Check prerequisites.
1116
1117 This checks whether the cluster is empty.
1118
1119 Any errors are signaled by raising errors.OpPrereqError.
1120
1121 """
1122 master = self.cfg.GetMasterNode()
1123
1124 nodelist = self.cfg.GetNodeList()
1125 if len(nodelist) != 1 or nodelist[0] != master:
1126 raise errors.OpPrereqError("There are still %d node(s) in"
1127 " this cluster." % (len(nodelist) - 1),
1128 errors.ECODE_INVAL)
1129 instancelist = self.cfg.GetInstanceList()
1130 if instancelist:
1131 raise errors.OpPrereqError("There are still %d instance(s) in"
1132 " this cluster." % len(instancelist),
1133 errors.ECODE_INVAL)
1134
def Exec(self, feedback_fn):
1153
def _VerifyCertificate(filename):
  """Verifies a certificate for LUVerifyCluster.
1157
1158 @type filename: string
1159 @param filename: Path to PEM file
1160
1161 """
1162 try:
1163 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1164 utils.ReadFile(filename))
1165 except Exception, err:
1166 return (LUVerifyCluster.ETYPE_ERROR,
1167 "Failed to load X509 certificate %s: %s" % (filename, err))
1168
1169 (errcode, msg) = \
1170 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1171 constants.SSL_CERT_EXPIRATION_ERROR)
1172
1173 if msg:
1174 fnamemsg = "While verifying %s: %s" % (filename, msg)
1175 else:
1176 fnamemsg = None
1177
1178 if errcode is None:
1179 return (None, fnamemsg)
1180 elif errcode == utils.CERT_WARNING:
1181 return (LUVerifyCluster.ETYPE_WARNING, fnamemsg)
1182 elif errcode == utils.CERT_ERROR:
1183 return (LUVerifyCluster.ETYPE_ERROR, fnamemsg)
1184
1185 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1186
class LUVerifyCluster(LogicalUnit):
  """Verifies the cluster status.
1190
1191 """
1192 HPATH = "cluster-verify"
1193 HTYPE = constants.HTYPE_CLUSTER
1194 _OP_PARAMS = [
1195 ("skip_checks", ht.EmptyList,
1196 ht.TListOf(ht.TElemOf(constants.VERIFY_OPTIONAL_CHECKS))),
1197 ("verbose", False, ht.TBool),
1198 ("error_codes", False, ht.TBool),
1199 ("debug_simulate_errors", False, ht.TBool),
1200 ]
1201 REQ_BGL = False
1202
1203 TCLUSTER = "cluster"
1204 TNODE = "node"
1205 TINSTANCE = "instance"
1206
1207 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1208 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1209 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1210 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1211 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1212 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1213 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1214 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1215 ENODEDRBD = (TNODE, "ENODEDRBD")
1216 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1217 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1218 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1219 ENODEHV = (TNODE, "ENODEHV")
1220 ENODELVM = (TNODE, "ENODELVM")
1221 ENODEN1 = (TNODE, "ENODEN1")
1222 ENODENET = (TNODE, "ENODENET")
1223 ENODEOS = (TNODE, "ENODEOS")
1224 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1225 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1226 ENODERPC = (TNODE, "ENODERPC")
1227 ENODESSH = (TNODE, "ENODESSH")
1228 ENODEVERSION = (TNODE, "ENODEVERSION")
1229 ENODESETUP = (TNODE, "ENODESETUP")
1230 ENODETIME = (TNODE, "ENODETIME")
1231
1232 ETYPE_FIELD = "code"
1233 ETYPE_ERROR = "ERROR"
1234 ETYPE_WARNING = "WARNING"
1235
class NodeImage(object):
  """A class representing the logical and physical status of a node.
1238
1239 @type name: string
1240 @ivar name: the node name to which this object refers
1241 @ivar volumes: a structure as returned from
1242 L{ganeti.backend.GetVolumeList} (runtime)
1243 @ivar instances: a list of running instances (runtime)
1244 @ivar pinst: list of configured primary instances (config)
1245 @ivar sinst: list of configured secondary instances (config)
@ivar sbp: dictionary of {secondary-node: list of instances} of all peers
1247 of this node (config)
1248 @ivar mfree: free memory, as reported by hypervisor (runtime)
1249 @ivar dfree: free disk, as reported by the node (runtime)
1250 @ivar offline: the offline status (config)
1251 @type rpc_fail: boolean
@ivar rpc_fail: whether the RPC verify call was successful (overall,
1253 not whether the individual keys were correct) (runtime)
1254 @type lvm_fail: boolean
1255 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1256 @type hyp_fail: boolean
1257 @ivar hyp_fail: whether the RPC call didn't return the instance list
1258 @type ghost: boolean
1259 @ivar ghost: whether this is a known node or not (config)
1260 @type os_fail: boolean
1261 @ivar os_fail: whether the RPC call didn't return valid OS data
1262 @type oslist: list
1263 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1264 @type vm_capable: boolean
1265 @ivar vm_capable: whether the node can host instances
1266
1267 """
def __init__(self, offline=False, name=None, vm_capable=True):
1269 self.name = name
1270 self.volumes = {}
1271 self.instances = []
1272 self.pinst = []
1273 self.sinst = []
1274 self.sbp = {}
1275 self.mfree = 0
1276 self.dfree = 0
1277 self.offline = offline
1278 self.vm_capable = vm_capable
1279 self.rpc_fail = False
1280 self.lvm_fail = False
1281 self.hyp_fail = False
1282 self.ghost = False
1283 self.os_fail = False
1284 self.oslist = {}
1285
1292
def _Error(self, ecode, item, msg, *args, **kwargs):
1294 """Format an error message.
1295
1296 Based on the opcode's error_codes parameter, either format a
1297 parseable error code, or a simpler error string.
1298
1299 This must be called only from Exec and functions called from Exec.
1300
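For illustration, with the error_codes opcode parameter set, an entry is
emitted in the machine-parseable form built below, for example::

  ERROR:ENODESSH:node:node1.example.com:<message text>
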
1301 """
1302 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1303 itype, etxt = ecode
1304
1305 if args:
1306 msg = msg % args
1307
1308 if self.op.error_codes:
1309 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1310 else:
1311 if item:
1312 item = " " + item
1313 else:
1314 item = ""
1315 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1316
1317 self._feedback_fn(" - %s" % msg)
1318
def _ErrorIf(self, cond, *args, **kwargs):
1320 """Log an error message if the passed condition is True.
1321
1322 """
1323 cond = bool(cond) or self.op.debug_simulate_errors
1324 if cond:
1325 self._Error(*args, **kwargs)
1326
1327 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1328 self.bad = self.bad or cond
1329
def _VerifyNode(self, ninfo, nresult):
  """Perform some basic validation on data returned from a node.
1332
1333 - check the result data structure is well formed and has all the
1334 mandatory fields
1335 - check ganeti version
1336
1337 @type ninfo: L{objects.Node}
1338 @param ninfo: the node to check
1339 @param nresult: the results from the node
1340 @rtype: boolean
1341 @return: whether overall this call was successful (and we can expect
reasonable values in the response)
1343
1344 """
1345 node = ninfo.name
1346 _ErrorIf = self._ErrorIf
1347
1348
1349 test = not nresult or not isinstance(nresult, dict)
1350 _ErrorIf(test, self.ENODERPC, node,
1351 "unable to verify node: no data returned")
1352 if test:
1353 return False
1354
1355
1356 local_version = constants.PROTOCOL_VERSION
1357 remote_version = nresult.get("version", None)
1358 test = not (remote_version and
1359 isinstance(remote_version, (list, tuple)) and
1360 len(remote_version) == 2)
1361 _ErrorIf(test, self.ENODERPC, node,
1362 "connection to node returned invalid data")
1363 if test:
1364 return False
1365
1366 test = local_version != remote_version[0]
1367 _ErrorIf(test, self.ENODEVERSION, node,
1368 "incompatible protocol versions: master %s,"
1369 " node %s", local_version, remote_version[0])
1370 if test:
1371 return False
1372
1373
1374
1375
1376 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1377 self.ENODEVERSION, node,
1378 "software version mismatch: master %s, node %s",
1379 constants.RELEASE_VERSION, remote_version[1],
1380 code=self.ETYPE_WARNING)
1381
1382 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1383 if ninfo.vm_capable and isinstance(hyp_result, dict):
1384 for hv_name, hv_result in hyp_result.iteritems():
1385 test = hv_result is not None
1386 _ErrorIf(test, self.ENODEHV, node,
1387 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1388
1389 test = nresult.get(constants.NV_NODESETUP,
1390 ["Missing NODESETUP results"])
1391 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1392 "; ".join(test))
1393
1394 return True
1395
def _VerifyNodeTime(self, ninfo, nresult,
1397 nvinfo_starttime, nvinfo_endtime):
1398 """Check the node time.
1399
1400 @type ninfo: L{objects.Node}
1401 @param ninfo: the node to check
1402 @param nresult: the remote results for the node
1403 @param nvinfo_starttime: the start time of the RPC call
1404 @param nvinfo_endtime: the end time of the RPC call
1405
1406 """
1407 node = ninfo.name
1408 _ErrorIf = self._ErrorIf
1409
1410 ntime = nresult.get(constants.NV_TIME, None)
1411 try:
1412 ntime_merged = utils.MergeTime(ntime)
1413 except (ValueError, TypeError):
1414 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1415 return
1416
1417 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1418 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1419 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1420 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1421 else:
1422 ntime_diff = None
1423
1424 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1425 "Node time diverges by at least %s from master node time",
1426 ntime_diff)
1427
def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
  """Check the node LVM results.
1430
1431 @type ninfo: L{objects.Node}
1432 @param ninfo: the node to check
1433 @param nresult: the remote results for the node
1434 @param vg_name: the configured VG name
1435
1436 """
1437 if vg_name is None:
1438 return
1439
1440 node = ninfo.name
1441 _ErrorIf = self._ErrorIf
1442
1443
1444 vglist = nresult.get(constants.NV_VGLIST, None)
1445 test = not vglist
1446 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1447 if not test:
1448 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1449 constants.MIN_VG_SIZE)
1450 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1451
1452
1453 pvlist = nresult.get(constants.NV_PVLIST, None)
1454 test = pvlist is None
1455 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1456 if not test:
1457
1458
1459
1460 for _, pvname, owner_vg in pvlist:
1461 test = ":" in pvname
1462 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1463 " '%s' of VG '%s'", pvname, owner_vg)
1464
def _VerifyNodeNetwork(self, ninfo, nresult):
  """Check the node network connectivity.
1467
1468 @type ninfo: L{objects.Node}
1469 @param ninfo: the node to check
1470 @param nresult: the remote results for the node
1471
1472 """
1473 node = ninfo.name
1474 _ErrorIf = self._ErrorIf
1475
1476 test = constants.NV_NODELIST not in nresult
1477 _ErrorIf(test, self.ENODESSH, node,
1478 "node hasn't returned node ssh connectivity data")
1479 if not test:
1480 if nresult[constants.NV_NODELIST]:
1481 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1482 _ErrorIf(True, self.ENODESSH, node,
1483 "ssh communication with node '%s': %s", a_node, a_msg)
1484
1485 test = constants.NV_NODENETTEST not in nresult
1486 _ErrorIf(test, self.ENODENET, node,
1487 "node hasn't returned node tcp connectivity data")
1488 if not test:
1489 if nresult[constants.NV_NODENETTEST]:
1490 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1491 for anode in nlist:
1492 _ErrorIf(True, self.ENODENET, node,
1493 "tcp communication with node '%s': %s",
1494 anode, nresult[constants.NV_NODENETTEST][anode])
1495
1496 test = constants.NV_MASTERIP not in nresult
1497 _ErrorIf(test, self.ENODENET, node,
1498 "node hasn't returned node master IP reachability data")
1499 if not test:
1500 if not nresult[constants.NV_MASTERIP]:
1501 if node == self.master_node:
1502 msg = "the master node cannot reach the master IP (not configured?)"
1503 else:
1504 msg = "cannot reach the master IP"
1505 _ErrorIf(True, self.ENODENET, node, msg)
1506
def _VerifyInstance(self, instance, instanceconfig, node_image,
1508 diskstatus):
1509 """Verify an instance.
1510
1511 This function checks to see if the required block devices are
1512 available on the instance's node.
1513
1514 """
1515 _ErrorIf = self._ErrorIf
1516 node_current = instanceconfig.primary_node
1517
1518 node_vol_should = {}
1519 instanceconfig.MapLVsByNode(node_vol_should)
1520
1521 for node in node_vol_should:
1522 n_img = node_image[node]
1523 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1524
1525 continue
1526 for volume in node_vol_should[node]:
1527 test = volume not in n_img.volumes
1528 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
1529 "volume %s missing on node %s", volume, node)
1530
1531 if instanceconfig.admin_up:
1532 pri_img = node_image[node_current]
1533 test = instance not in pri_img.instances and not pri_img.offline
1534 _ErrorIf(test, self.EINSTANCEDOWN, instance,
1535 "instance not running on its primary node %s",
1536 node_current)
1537
1538 for node, n_img in node_image.items():
1539 if (not node == node_current):
1540 test = instance in n_img.instances
1541 _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
1542 "instance should not run on node %s", node)
1543
1544 diskdata = [(nname, success, status, idx)
1545 for (nname, disks) in diskstatus.items()
1546 for idx, (success, status) in enumerate(disks)]
1547
1548 for nname, success, bdev_status, idx in diskdata:
1549 _ErrorIf(instanceconfig.admin_up and not success,
1550 self.EINSTANCEFAULTYDISK, instance,
1551 "couldn't retrieve status for disk/%s on %s: %s",
1552 idx, nname, bdev_status)
1553 _ErrorIf((instanceconfig.admin_up and success and
1554 bdev_status.ldisk_status == constants.LDS_FAULTY),
1555 self.EINSTANCEFAULTYDISK, instance,
1556 "disk/%s on %s is faulty", idx, nname)
1557
def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
  """Verify if there are any unknown volumes in the cluster.
1560
1561 The .os, .swap and backup volumes are ignored. All other volumes are
1562 reported as unknown.
1563
1564 @type reserved: L{ganeti.utils.FieldSet}
1565 @param reserved: a FieldSet of reserved volume names
1566
1567 """
1568 for node, n_img in node_image.items():
1569 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1570
1571 continue
1572 for volume in n_img.volumes:
1573 test = ((node not in node_vol_should or
1574 volume not in node_vol_should[node]) and
1575 not reserved.Matches(volume))
1576 self._ErrorIf(test, self.ENODEORPHANLV, node,
1577 "volume %s is unknown", volume)
1578
def _VerifyOrphanInstances(self, instancelist, node_image):
  """Verify the list of running instances.
1581
1582 This checks what instances are running but unknown to the cluster.
1583
1584 """
1585 for node, n_img in node_image.items():
1586 for o_inst in n_img.instances:
1587 test = o_inst not in instancelist
1588 self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
1589 "instance %s on node %s should not exist", o_inst, node)
1590
def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
  """Verify N+1 Memory Resilience.
1593
1594 Check that if one single node dies we can still start all the
1595 instances it was primary for.
1596
1597 """
1598 for node, n_img in node_image.items():
1599
1600
1601
1602
1603
1604
1605
1606
1607 for prinode, instances in n_img.sbp.items():
1608 needed_mem = 0
1609 for instance in instances:
1610 bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
1611 if bep[constants.BE_AUTO_BALANCE]:
1612 needed_mem += bep[constants.BE_MEMORY]
1613 test = n_img.mfree < needed_mem
1614 self._ErrorIf(test, self.ENODEN1, node,
"not enough memory to accommodate"
1616 " failovers should peer node %s fail", prinode)
1617
def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
1619 master_files):
1620 """Verifies and computes the node required file checksums.
1621
1622 @type ninfo: L{objects.Node}
1623 @param ninfo: the node to check
1624 @param nresult: the remote results for the node
1625 @param file_list: required list of files
1626 @param local_cksum: dictionary of local files and their checksums
1627 @param master_files: list of files that only masters should have
1628
1629 """
1630 node = ninfo.name
1631 _ErrorIf = self._ErrorIf
1632
1633 remote_cksum = nresult.get(constants.NV_FILELIST, None)
1634 test = not isinstance(remote_cksum, dict)
1635 _ErrorIf(test, self.ENODEFILECHECK, node,
1636 "node hasn't returned file checksum data")
1637 if test:
1638 return
1639
1640 for file_name in file_list:
1641 node_is_mc = ninfo.master_candidate
1642 must_have = (file_name not in master_files) or node_is_mc
1643
1644 test1 = file_name not in remote_cksum
1645
1646 test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
1647
1648 test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
1649 _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
1650 "file '%s' missing", file_name)
1651 _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
1652 "file '%s' has wrong checksum", file_name)
1653
1654 _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
1655 "file '%s' should not exist on non master"
1656 " candidates (and the file is outdated)", file_name)
1657
1658 _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
1659 "file '%s' should not exist"
1660 " on non master candidates", file_name)
1661
def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
1663 drbd_map):
"""Verifies the node DRBD status.
1665
1666 @type ninfo: L{objects.Node}
1667 @param ninfo: the node to check
1668 @param nresult: the remote results for the node
1669 @param instanceinfo: the dict of instances
1670 @param drbd_helper: the configured DRBD usermode helper
1671 @param drbd_map: the DRBD map as returned by
1672 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1673
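For illustration, drbd_map has the shape (a sketch based on the usage
below)::

  {"node1.example.com": {0: "instance1.example.com", ...}, ...}
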
1674 """
1675 node = ninfo.name
1676 _ErrorIf = self._ErrorIf
1677
1678 if drbd_helper:
1679 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1680 test = (helper_result == None)
1681 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1682 "no drbd usermode helper returned")
1683 if helper_result:
1684 status, payload = helper_result
1685 test = not status
1686 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1687 "drbd usermode helper check unsuccessful: %s", payload)
1688 test = status and (payload != drbd_helper)
1689 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1690 "wrong drbd usermode helper: %s", payload)
1691
1692
1693 node_drbd = {}
1694 for minor, instance in drbd_map[node].items():
1695 test = instance not in instanceinfo
1696 _ErrorIf(test, self.ECLUSTERCFG, None,
1697 "ghost instance '%s' in temporary DRBD map", instance)
1698
1699
1700
1701 if test:
1702 node_drbd[minor] = (instance, False)
1703 else:
1704 instance = instanceinfo[instance]
1705 node_drbd[minor] = (instance.name, instance.admin_up)
1706
1707
1708 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1709 test = not isinstance(used_minors, (tuple, list))
1710 _ErrorIf(test, self.ENODEDRBD, node,
1711 "cannot parse drbd status file: %s", str(used_minors))
1712 if test:
1713
1714 return
1715
1716 for minor, (iname, must_exist) in node_drbd.items():
1717 test = minor not in used_minors and must_exist
1718 _ErrorIf(test, self.ENODEDRBD, node,
1719 "drbd minor %d of instance %s is not active", minor, iname)
1720 for minor in used_minors:
1721 test = minor not in node_drbd
1722 _ErrorIf(test, self.ENODEDRBD, node,
1723 "unallocated drbd minor %d is in use", minor)
1724
def _UpdateNodeOS(self, ninfo, nresult, nimg):
  """Builds the node OS structures.
1727
1728 @type ninfo: L{objects.Node}
1729 @param ninfo: the node to check
1730 @param nresult: the remote results for the node
1731 @param nimg: the node image object
1732
1733 """
1734 node = ninfo.name
1735 _ErrorIf = self._ErrorIf
1736
1737 remote_os = nresult.get(constants.NV_OSLIST, None)
1738 test = (not isinstance(remote_os, list) or
1739 not compat.all(isinstance(v, list) and len(v) == 7
1740 for v in remote_os))
1741
1742 _ErrorIf(test, self.ENODEOS, node,
1743 "node hasn't returned valid OS data")
1744
1745 nimg.os_fail = test
1746
1747 if test:
1748 return
1749
1750 os_dict = {}
1751
1752 for (name, os_path, status, diagnose,
1753 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
1754
1755 if name not in os_dict:
1756 os_dict[name] = []
1757
1758
1759
1760 parameters = [tuple(v) for v in parameters]
1761 os_dict[name].append((os_path, status, diagnose,
1762 set(variants), set(parameters), set(api_ver)))
1763
1764 nimg.oslist = os_dict
1765
def _VerifyNodeOS(self, ninfo, nimg, base):
  """Verifies the node OS list.
1768
1769 @type ninfo: L{objects.Node}
1770 @param ninfo: the node to check
1771 @param nimg: the node image object
1772 @param base: the 'template' node we match against (e.g. from the master)
1773
1774 """
1775 node = ninfo.name
1776 _ErrorIf = self._ErrorIf
1777
1778 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
1779
1780 for os_name, os_data in nimg.oslist.items():
1781 assert os_data, "Empty OS status for OS %s?!" % os_name
1782 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
1783 _ErrorIf(not f_status, self.ENODEOS, node,
1784 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
1785 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
1786 "OS '%s' has multiple entries (first one shadows the rest): %s",
1787 os_name, utils.CommaJoin([v[0] for v in os_data]))
1788
1789 _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
1790 and not f_var, self.ENODEOS, node,
1791 "OS %s with API at least %d does not declare any variant",
1792 os_name, constants.OS_API_V15)
1793
1794 test = os_name not in base.oslist
1795 _ErrorIf(test, self.ENODEOS, node,
1796 "Extra OS %s not present on reference node (%s)",
1797 os_name, base.name)
1798 if test:
1799 continue
1800 assert base.oslist[os_name], "Base node has empty OS status?"
1801 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
1802 if not b_status:
1803
1804 continue
1805 for kind, a, b in [("API version", f_api, b_api),
1806 ("variants list", f_var, b_var),
1807 ("parameters", f_param, b_param)]:
1808 _ErrorIf(a != b, self.ENODEOS, node,
1809 "OS %s %s differs from reference node %s: %s vs. %s",
1810 kind, os_name, base.name,
1811 utils.CommaJoin(a), utils.CommaJoin(b))
1812
1813
1814 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
1815 _ErrorIf(missing, self.ENODEOS, node,
1816 "OSes present on reference node %s but missing on this node: %s",
1817 base.name, utils.CommaJoin(missing))
1818
def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
  """Verifies and updates the node volume data.
1821
1822 This function will update a L{NodeImage}'s internal structures
1823 with data from the remote call.
1824
1825 @type ninfo: L{objects.Node}
1826 @param ninfo: the node to check
1827 @param nresult: the remote results for the node
1828 @param nimg: the node image object
1829 @param vg_name: the configured VG name
1830
1831 """
1832 node = ninfo.name
1833 _ErrorIf = self._ErrorIf
1834
1835 nimg.lvm_fail = True
1836 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1837 if vg_name is None:
1838 pass
1839 elif isinstance(lvdata, basestring):
1840 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
1841 utils.SafeEncode(lvdata))
1842 elif not isinstance(lvdata, dict):
1843 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
1844 else:
1845 nimg.volumes = lvdata
1846 nimg.lvm_fail = False
1847
def _UpdateNodeInstances(self, ninfo, nresult, nimg):
  """Verifies and updates the node instance list.
1850
1851 If the listing was successful, then updates this node's instance
1852 list. Otherwise, it marks the RPC call as failed for the instance
1853 list key.
1854
1855 @type ninfo: L{objects.Node}
1856 @param ninfo: the node to check
1857 @param nresult: the remote results for the node
1858 @param nimg: the node image object
1859
1860 """
1861 idata = nresult.get(constants.NV_INSTANCELIST, None)
1862 test = not isinstance(idata, list)
1863 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
1864 " (instancelist): %s", utils.SafeEncode(str(idata)))
1865 if test:
1866 nimg.hyp_fail = True
1867 else:
1868 nimg.instances = idata
1869
def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
  """Verifies and computes a node information map
1872
1873 @type ninfo: L{objects.Node}
1874 @param ninfo: the node to check
1875 @param nresult: the remote results for the node
1876 @param nimg: the node image object
1877 @param vg_name: the configured VG name
1878
1879 """
1880 node = ninfo.name
1881 _ErrorIf = self._ErrorIf
1882
1883
1884 hv_info = nresult.get(constants.NV_HVINFO, None)
1885 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1886 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
1887 if not test:
1888 try:
1889 nimg.mfree = int(hv_info["memory_free"])
1890 except (ValueError, TypeError):
1891 _ErrorIf(True, self.ENODERPC, node,
1892 "node returned invalid nodeinfo, check hypervisor")
1893
1894
1895 if vg_name is not None:
1896 test = (constants.NV_VGLIST not in nresult or
1897 vg_name not in nresult[constants.NV_VGLIST])
1898 _ErrorIf(test, self.ENODELVM, node,
1899 "node didn't return data for the volume group '%s'"
1900 " - it is either missing or broken", vg_name)
1901 if not test:
1902 try:
1903 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1904 except (ValueError, TypeError):
1905 _ErrorIf(True, self.ENODERPC, node,
1906 "node returned invalid LVM info, check LVM status")
1907
def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
  """Gets per-disk status information for all instances.
1910
1911 @type nodelist: list of strings
1912 @param nodelist: Node names
1913 @type node_image: dict of (name, L{objects.Node})
1914 @param node_image: Node objects
1915 @type instanceinfo: dict of (name, L{objects.Instance})
1916 @param instanceinfo: Instance objects
@rtype: {instance: {node: [(success, payload)]}}
1918 @return: a dictionary of per-instance dictionaries with nodes as
1919 keys and disk information as values; the disk information is a
1920 list of tuples (success, payload)
1921
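For illustration, a possible (sketched) result would be::

  {"inst1.example.com": {"node1.example.com": [(True, status0),
                                               (True, status1)]}}
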
1922 """
1923 _ErrorIf = self._ErrorIf
1924
1925 node_disks = {}
1926 node_disks_devonly = {}
1927
1928 for nname in nodelist:
1929 disks = [(inst, disk)
1930 for instlist in [node_image[nname].pinst,
1931 node_image[nname].sinst]
1932 for inst in instlist
1933 for disk in instanceinfo[inst].disks]
1934
1935 if not disks:
1936
1937 continue
1938
1939 node_disks[nname] = disks
1940
1941
1942
1943 devonly = [dev.Copy() for (_, dev) in disks]
1944
1945 for dev in devonly:
1946 self.cfg.SetDiskID(dev, nname)
1947
1948 node_disks_devonly[nname] = devonly
1949
1950 assert len(node_disks) == len(node_disks_devonly)
1951
1952
1953 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
1954 node_disks_devonly)
1955
1956 assert len(result) == len(node_disks)
1957
1958 instdisk = {}
1959
1960 for (nname, nres) in result.items():
1961 disks = node_disks[nname]
1962
1963 if nres.offline:
1964
1965 data = len(disks) * [(False, "node offline")]
1966 else:
1967 msg = nres.fail_msg
1968 _ErrorIf(msg, self.ENODERPC, nname,
1969 "while getting disk information: %s", msg)
1970 if msg:
1971
1972 data = len(disks) * [(False, msg)]
1973 else:
1974 data = []
1975 for idx, i in enumerate(nres.payload):
1976 if isinstance(i, (tuple, list)) and len(i) == 2:
1977 data.append(i)
1978 else:
1979 logging.warning("Invalid result from node %s, entry %d: %s",
1980 nname, idx, i)
1981 data.append((False, "Invalid result from the remote node"))
1982
1983 for ((inst, _), status) in zip(disks, data):
1984 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
1985
1986 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
1987 len(nnames) <= len(instanceinfo[inst].all_nodes) and
1988 compat.all(isinstance(s, (tuple, list)) and
1989 len(s) == 2 for s in statuses)
1990 for inst, nnames in instdisk.items()
1991 for nname, statuses in nnames.items())
1992 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
1993
1994 return instdisk
1995
1996 - def BuildHooksEnv(self):
1997     """Build hooks env.
1998
1999     Cluster-Verify hooks are run only in the post phase; if they fail, their
2000     output is logged in the verify output and the verification fails.
2001
2002 """
2003 all_nodes = self.cfg.GetNodeList()
2004 env = {
2005 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2006 }
2007 for node in self.cfg.GetAllNodesInfo().values():
2008 env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())
2009
2010 return env, [], all_nodes
2011
2012 - def Exec(self, feedback_fn):
2013     """Verify integrity of cluster, performing various tests on nodes.
2014
2015 """
2016 self.bad = False
2017 _ErrorIf = self._ErrorIf
2018 verbose = self.op.verbose
2019 self._feedback_fn = feedback_fn
2020 feedback_fn("* Verifying global settings")
2021 for msg in self.cfg.VerifyConfig():
2022 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
2023
2024
2025 for cert_filename in constants.ALL_CERT_FILES:
2026 (errcode, msg) = _VerifyCertificate(cert_filename)
2027 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
2028
2029 vg_name = self.cfg.GetVGName()
2030 drbd_helper = self.cfg.GetDRBDHelper()
2031 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
2032 cluster = self.cfg.GetClusterInfo()
2033 nodelist = utils.NiceSort(self.cfg.GetNodeList())
2034 nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
2035 instancelist = utils.NiceSort(self.cfg.GetInstanceList())
2036 instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
2037 for iname in instancelist)
2038 i_non_redundant = []
2039 i_non_a_balanced = []
2040 n_offline = 0
2041 n_drained = 0
2042 node_vol_should = {}
2043
2044
2045
2046 master_files = [constants.CLUSTER_CONF_FILE]
2047 master_node = self.master_node = self.cfg.GetMasterNode()
2048 master_ip = self.cfg.GetMasterIP()
2049
2050 file_names = ssconf.SimpleStore().GetFileList()
2051 file_names.extend(constants.ALL_CERT_FILES)
2052 file_names.extend(master_files)
2053 if cluster.modify_etc_hosts:
2054 file_names.append(constants.ETC_HOSTS)
2055
2056 local_checksums = utils.FingerprintFiles(file_names)
2057
2058 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2059 node_verify_param = {
2060 constants.NV_FILELIST: file_names,
2061 constants.NV_NODELIST: [node.name for node in nodeinfo
2062 if not node.offline],
2063 constants.NV_HYPERVISOR: hypervisors,
2064 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2065 node.secondary_ip) for node in nodeinfo
2066 if not node.offline],
2067 constants.NV_INSTANCELIST: hypervisors,
2068 constants.NV_VERSION: None,
2069 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2070 constants.NV_NODESETUP: None,
2071 constants.NV_TIME: None,
2072 constants.NV_MASTERIP: (master_node, master_ip),
2073 constants.NV_OSLIST: None,
2074 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2075 }
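         # node_verify_param maps each NV_* check name to the argument that
         # check needs (None for checks without arguments); the whole dict is
         # sent to every node through the node_verify RPC call further down.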
2076
2077 if vg_name is not None:
2078 node_verify_param[constants.NV_VGLIST] = None
2079 node_verify_param[constants.NV_LVLIST] = vg_name
2080 node_verify_param[constants.NV_PVLIST] = [vg_name]
2081 node_verify_param[constants.NV_DRBDLIST] = None
2082
2083 if drbd_helper:
2084 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2085
2086
2087 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2088 name=node.name,
2089 vm_capable=node.vm_capable))
2090 for node in nodeinfo)
2091
2092 for instance in instancelist:
2093 inst_config = instanceinfo[instance]
2094
2095 for nname in inst_config.all_nodes:
2096 if nname not in node_image:
2097
2098 gnode = self.NodeImage(name=nname)
2099 gnode.ghost = True
2100 node_image[nname] = gnode
2101
2102 inst_config.MapLVsByNode(node_vol_should)
2103
2104 pnode = inst_config.primary_node
2105 node_image[pnode].pinst.append(instance)
2106
2107 for snode in inst_config.secondary_nodes:
2108 nimg = node_image[snode]
2109 nimg.sinst.append(instance)
2110 if pnode not in nimg.sbp:
2111 nimg.sbp[pnode] = []
2112 nimg.sbp[pnode].append(instance)
2113
2114
2115
2116
2117
2118
2119
2120
2121 nvinfo_starttime = time.time()
2122 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2123 self.cfg.GetClusterName())
2124 nvinfo_endtime = time.time()
2125
2126 all_drbd_map = self.cfg.ComputeDRBDMap()
2127
2128 feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist))
2129 instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo)
2130
2131 feedback_fn("* Verifying node status")
2132
2133 refos_img = None
2134
2135 for node_i in nodeinfo:
2136 node = node_i.name
2137 nimg = node_image[node]
2138
2139 if node_i.offline:
2140 if verbose:
2141 feedback_fn("* Skipping offline node %s" % (node,))
2142 n_offline += 1
2143 continue
2144
2145 if node == master_node:
2146 ntype = "master"
2147 elif node_i.master_candidate:
2148 ntype = "master candidate"
2149 elif node_i.drained:
2150 ntype = "drained"
2151 n_drained += 1
2152 else:
2153 ntype = "regular"
2154 if verbose:
2155 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2156
2157 msg = all_nvinfo[node].fail_msg
2158 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2159 if msg:
2160 nimg.rpc_fail = True
2161 continue
2162
2163 nresult = all_nvinfo[node].payload
2164
2165 nimg.call_ok = self._VerifyNode(node_i, nresult)
2166 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2167 self._VerifyNodeNetwork(node_i, nresult)
2168 self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
2169 master_files)
2170
2171 if nimg.vm_capable:
2172 self._VerifyNodeLVM(node_i, nresult, vg_name)
2173 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
2174 all_drbd_map)
2175
2176 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2177 self._UpdateNodeInstances(node_i, nresult, nimg)
2178 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2179 self._UpdateNodeOS(node_i, nresult, nimg)
2180 if not nimg.os_fail:
2181 if refos_img is None:
2182 refos_img = nimg
2183 self._VerifyNodeOS(node_i, nimg, refos_img)
2184
2185 feedback_fn("* Verifying instance status")
2186 for instance in instancelist:
2187 if verbose:
2188 feedback_fn("* Verifying instance %s" % instance)
2189 inst_config = instanceinfo[instance]
2190 self._VerifyInstance(instance, inst_config, node_image,
2191 instdisk[instance])
2192 inst_nodes_offline = []
2193
2194 pnode = inst_config.primary_node
2195 pnode_img = node_image[pnode]
2196 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2197 self.ENODERPC, pnode, "instance %s, connection to"
2198 " primary node failed", instance)
2199
2200 if pnode_img.offline:
2201 inst_nodes_offline.append(pnode)
2202
2203
2204
2205
2206
2207
2208 if not inst_config.secondary_nodes:
2209 i_non_redundant.append(instance)
2210 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2211 instance, "instance has multiple secondary nodes: %s",
2212 utils.CommaJoin(inst_config.secondary_nodes),
2213 code=self.ETYPE_WARNING)
2214
2215 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2216 i_non_a_balanced.append(instance)
2217
2218 for snode in inst_config.secondary_nodes:
2219 s_img = node_image[snode]
2220 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2221 "instance %s, connection to secondary node failed", instance)
2222
2223 if s_img.offline:
2224 inst_nodes_offline.append(snode)
2225
2226
2227 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2228 "instance lives on offline node(s) %s",
2229 utils.CommaJoin(inst_nodes_offline))
2230
2231 for node in inst_config.all_nodes:
2232 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2233 "instance lives on ghost node %s", node)
2234 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2235 instance, "instance lives on non-vm_capable node %s", node)
2236
2237 feedback_fn("* Verifying orphan volumes")
2238 reserved = utils.FieldSet(*cluster.reserved_lvs)
2239 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2240
2241 feedback_fn("* Verifying orphan instances")
2242 self._VerifyOrphanInstances(instancelist, node_image)
2243
2244 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2245 feedback_fn("* Verifying N+1 Memory redundancy")
2246 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2247
2248 feedback_fn("* Other Notes")
2249 if i_non_redundant:
2250 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2251 % len(i_non_redundant))
2252
2253 if i_non_a_balanced:
2254 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2255 % len(i_non_a_balanced))
2256
2257 if n_offline:
2258 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2259
2260 if n_drained:
2261 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2262
2263 return not self.bad
2264
2265 - def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2266 """Analyze the post-hooks' result
2267
2268 This method analyses the hook result, handles it, and sends some
2269 nicely-formatted feedback back to the user.
2270
2271 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2272 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2273 @param hooks_results: the results of the multi-node hooks rpc call
2274     @param feedback_fn: function used to send feedback back to the caller
2275 @param lu_result: previous Exec result
2276 @return: the new Exec result, based on the previous result
2277 and hook results
2278
2279 """
2280
2281
2282 if phase == constants.HOOKS_PHASE_POST:
2283
2284 indent_re = re.compile('^', re.M)
2285 feedback_fn("* Hooks Results")
2286 assert hooks_results, "invalid result from hooks"
2287
2288 for node_name in hooks_results:
2289 res = hooks_results[node_name]
2290 msg = res.fail_msg
2291 test = msg and not res.offline
2292 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2293 "Communication failure in hooks execution: %s", msg)
2294 if res.offline or msg:
2295
2296
2297
2298 lu_result = 1
2299 continue
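             # Each payload entry is a (script, status, output) tuple; a status
             # of HKR_FAIL marks a failed hook script, whose indented output is
             # echoed back to the user below.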
2300 for script, hkr, output in res.payload:
2301 test = hkr == constants.HKR_FAIL
2302 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2303 "Script %s failed, output:", script)
2304 if test:
2305 output = indent_re.sub(' ', output)
2306 feedback_fn("%s" % output)
2307 lu_result = 0
2308
2309 return lu_result
2310
2313   """Verifies the status of the cluster disks.
2314
2315 """
2316 REQ_BGL = False
2317
2324
2325 - def Exec(self, feedback_fn):
2326 """Verify integrity of cluster disks.
2327
2328 @rtype: tuple of three items
2329 @return: a tuple of (dict of node-to-node_error, list of instances
2330 which need activate-disks, dict of instance: (node, volume) for
2331       missing volumes)
2332
2333 """
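         # Illustrative shape of the value returned below (all names are
         # placeholders):
         #   ({"node1": "error message"},        # nodes that could not be queried
         #    ["inst1"],                         # instances needing activate-disks
         #    {"inst2": [("node2", "lv-name")]}) # instances with missing volumes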
2334 result = res_nodes, res_instances, res_missing = {}, [], {}
2335
2336 vg_name = self.cfg.GetVGName()
2337 nodes = utils.NiceSort(self.cfg.GetNodeList())
2338 instances = [self.cfg.GetInstanceInfo(name)
2339 for name in self.cfg.GetInstanceList()]
2340
2341 nv_dict = {}
2342 for inst in instances:
2343 inst_lvs = {}
2344 if (not inst.admin_up or
2345 inst.disk_template not in constants.DTS_NET_MIRROR):
2346 continue
2347 inst.MapLVsByNode(inst_lvs)
2348
2349 for node, vol_list in inst_lvs.iteritems():
2350 for vol in vol_list:
2351 nv_dict[(node, vol)] = inst
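         # nv_dict now maps (node_name, lv_name) to the owning instance;
         # entries that survive the per-node LV listing below correspond to
         # missing volumes.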
2352
2353 if not nv_dict:
2354 return result
2355
2356 node_lvs = self.rpc.call_lv_list(nodes, vg_name)
2357
2358 for node in nodes:
2359
2360 node_res = node_lvs[node]
2361 if node_res.offline:
2362 continue
2363 msg = node_res.fail_msg
2364 if msg:
2365 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2366 res_nodes[node] = msg
2367 continue
2368
2369 lvs = node_res.payload
2370 for lv_name, (_, _, lv_online) in lvs.items():
2371 inst = nv_dict.pop((node, lv_name), None)
2372 if (not lv_online and inst is not None
2373 and inst.name not in res_instances):
2374 res_instances.append(inst.name)
2375
2376
2377
2378 for key, inst in nv_dict.iteritems():
2379 if inst.name not in res_missing:
2380 res_missing[inst.name] = []
2381 res_missing[inst.name].append(key)
2382
2383 return result
2384
2387   """Verifies the sizes of the cluster disks.
2388
2389 """
2390 _OP_PARAMS = [("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString))]
2391 REQ_BGL = False
2392
2411
2415
2416 - def CheckPrereq(self):
2417     """Check prerequisites.
2418
2419 This only checks the optional instance list against the existing names.
2420
2421 """
2422 if self.wanted_names is None:
2423 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
2424
2425 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2426 in self.wanted_names]
2427
2428 - def _EnsureChildSizes(self, disk):
2429     """Ensure children of the disk have the needed disk size.
2430
2431 This is valid mainly for DRBD8 and fixes an issue where the
2432     children have a smaller disk size than the parent.
2433
2434 @param disk: an L{ganeti.objects.Disk} object
2435
2436 """
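         # Assuming the standard DRBD8 disk layout (data LV as the first
         # child, metadata LV second), only the data child needs to be grown
         # to match the parent size.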
2437 if disk.dev_type == constants.LD_DRBD8:
2438 assert disk.children, "Empty children for DRBD8?"
2439 fchild = disk.children[0]
2440 mismatch = fchild.size < disk.size
2441 if mismatch:
2442 self.LogInfo("Child disk has size %d, parent %d, fixing",
2443 fchild.size, disk.size)
2444 fchild.size = disk.size
2445
2446
2447 return self._EnsureChildSizes(fchild) or mismatch
2448 else:
2449 return False
2450
2451 - def Exec(self, feedback_fn):
2452 """Verify the size of cluster disks.
2453
2454 """
2455
2456
2457 per_node_disks = {}
2458 for instance in self.wanted_instances:
2459 pnode = instance.primary_node
2460 if pnode not in per_node_disks:
2461 per_node_disks[pnode] = []
2462 for idx, disk in enumerate(instance.disks):
2463 per_node_disks[pnode].append((instance, idx, disk))
2464
2465 changed = []
2466 for node, dskl in per_node_disks.items():
2467 newl = [v[2].Copy() for v in dskl]
2468 for dsk in newl:
2469 self.cfg.SetDiskID(dsk, node)
2470 result = self.rpc.call_blockdev_getsizes(node, newl)
2471 if result.fail_msg:
2472 self.LogWarning("Failure in blockdev_getsizes call to node"
2473 " %s, ignoring", node)
2474 continue
2475 if len(result.data) != len(dskl):
2476 self.LogWarning("Invalid result from node %s, ignoring node results",
2477 node)
2478 continue
2479 for ((instance, idx, disk), size) in zip(dskl, result.data):
2480 if size is None:
2481 self.LogWarning("Disk %d of instance %s did not return size"
2482 " information, ignoring", idx, instance.name)
2483 continue
2484 if not isinstance(size, (int, long)):
2485 self.LogWarning("Disk %d of instance %s did not return valid"
2486 " size information, ignoring", idx, instance.name)
2487 continue
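             # the node reports sizes in bytes while disk.size is recorded in
             # MiB, hence the 20-bit shift below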
2488 size = size >> 20
2489 if size != disk.size:
2490 self.LogInfo("Disk %d of instance %s has mismatched size,"
2491 " correcting: recorded %d, actual %d", idx,
2492 instance.name, disk.size, size)
2493 disk.size = size
2494 self.cfg.Update(instance, feedback_fn)
2495 changed.append((instance.name, idx, size))
2496 if self._EnsureChildSizes(disk):
2497 self.cfg.Update(instance, feedback_fn)
2498 changed.append((instance.name, idx, disk.size))
2499 return changed
2500
2579
2582 """Change the parameters of the cluster.
2583
2584 """
2585 HPATH = "cluster-modify"
2586 HTYPE = constants.HTYPE_CLUSTER
2587 _OP_PARAMS = [
2588 ("vg_name", None, ht.TMaybeString),
2589 ("enabled_hypervisors", None,
2590 ht.TOr(ht.TAnd(ht.TListOf(ht.TElemOf(constants.HYPER_TYPES)), ht.TTrue),
2591 ht.TNone)),
2592 ("hvparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2593 ht.TNone)),
2594 ("beparams", None, ht.TOr(ht.TDict, ht.TNone)),
2595 ("os_hvp", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2596 ht.TNone)),
2597 ("osparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2598 ht.TNone)),
2599 ("candidate_pool_size", None, ht.TOr(ht.TStrictPositiveInt, ht.TNone)),
2600 ("uid_pool", None, ht.NoType),
2601 ("add_uids", None, ht.NoType),
2602 ("remove_uids", None, ht.NoType),
2603 ("maintain_node_health", None, ht.TMaybeBool),
2604 ("prealloc_wipe_disks", None, ht.TMaybeBool),
2605 ("nicparams", None, ht.TOr(ht.TDict, ht.TNone)),
2606 ("drbd_helper", None, ht.TOr(ht.TString, ht.TNone)),
2607 ("default_iallocator", None, ht.TOr(ht.TString, ht.TNone)),
2608 ("reserved_lvs", None, ht.TOr(ht.TListOf(ht.TNonEmptyString), ht.TNone)),
2609 ("hidden_os", None, ht.TOr(ht.TListOf(\
2610 ht.TAnd(ht.TList,
2611 ht.TIsLength(2),
2612 ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
2613 ht.TNone)),
2614 ("blacklisted_os", None, ht.TOr(ht.TListOf(\
2615 ht.TAnd(ht.TList,
2616 ht.TIsLength(2),
2617 ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
2618 ht.TNone)),
2619 ]
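       # Per the checks above, hidden_os and blacklisted_os each take a list of
       # (modification, os_name) pairs, e.g. [(constants.DDM_ADD, "some-os")]
       # with "some-os" being a placeholder name; the pairs are applied by
       # helper_os() in Exec.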
2620 REQ_BGL = False
2621
2634
2642
2643 - def BuildHooksEnv(self):
2644     """Build hooks env.
2645
2646 """
2647 env = {
2648 "OP_TARGET": self.cfg.GetClusterName(),
2649 "NEW_VG_NAME": self.op.vg_name,
2650 }
2651 mn = self.cfg.GetMasterNode()
2652 return env, [mn], [mn]
2653
2654 - def CheckPrereq(self):
2655     """Check prerequisites.
2656
2657     This checks that the given parameters don't conflict and that
2658     the given volume group is valid.
2659
2660 """
2661 if self.op.vg_name is not None and not self.op.vg_name:
2662 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2663 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2664 " instances exist", errors.ECODE_INVAL)
2665
2666 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2667 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2668 raise errors.OpPrereqError("Cannot disable drbd helper while"
2669 " drbd-based instances exist",
2670 errors.ECODE_INVAL)
2671
2672 node_list = self.acquired_locks[locking.LEVEL_NODE]
2673
2674
2675 if self.op.vg_name:
2676 vglist = self.rpc.call_vg_list(node_list)
2677 for node in node_list:
2678 msg = vglist[node].fail_msg
2679 if msg:
2680
2681 self.LogWarning("Error while gathering data on node %s"
2682 " (ignoring node): %s", node, msg)
2683 continue
2684 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2685 self.op.vg_name,
2686 constants.MIN_VG_SIZE)
2687 if vgstatus:
2688 raise errors.OpPrereqError("Error on node '%s': %s" %
2689 (node, vgstatus), errors.ECODE_ENVIRON)
2690
2691 if self.op.drbd_helper:
2692
2693 helpers = self.rpc.call_drbd_helper(node_list)
2694 for node in node_list:
2695 ninfo = self.cfg.GetNodeInfo(node)
2696 if ninfo.offline:
2697 self.LogInfo("Not checking drbd helper on offline node %s", node)
2698 continue
2699 msg = helpers[node].fail_msg
2700 if msg:
2701 raise errors.OpPrereqError("Error checking drbd helper on node"
2702 " '%s': %s" % (node, msg),
2703 errors.ECODE_ENVIRON)
2704 node_helper = helpers[node].payload
2705 if node_helper != self.op.drbd_helper:
2706 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2707 (node, node_helper), errors.ECODE_ENVIRON)
2708
2709 self.cluster = cluster = self.cfg.GetClusterInfo()
2710
2711 if self.op.beparams:
2712 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2713 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2714
2715 if self.op.nicparams:
2716 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2717 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2718 objects.NIC.CheckParameterSyntax(self.new_nicparams)
2719 nic_errors = []
2720
2721
2722 for instance in self.cfg.GetAllInstancesInfo().values():
2723 for nic_idx, nic in enumerate(instance.nics):
2724 params_copy = copy.deepcopy(nic.nicparams)
2725 params_filled = objects.FillDict(self.new_nicparams, params_copy)
2726
2727
2728 try:
2729 objects.NIC.CheckParameterSyntax(params_filled)
2730 except errors.ConfigurationError, err:
2731 nic_errors.append("Instance %s, nic/%d: %s" %
2732 (instance.name, nic_idx, err))
2733
2734
2735 target_mode = params_filled[constants.NIC_MODE]
2736 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
2737           nic_errors.append("Instance %s, nic/%d: routed NIC with no IP" %
2738 (instance.name, nic_idx))
2739 if nic_errors:
2740 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2741 "\n".join(nic_errors))
2742
2743
2744 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2745 if self.op.hvparams:
2746 for hv_name, hv_dict in self.op.hvparams.items():
2747 if hv_name not in self.new_hvparams:
2748 self.new_hvparams[hv_name] = hv_dict
2749 else:
2750 self.new_hvparams[hv_name].update(hv_dict)
2751
2752
2753 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
2754 if self.op.os_hvp:
2755 for os_name, hvs in self.op.os_hvp.items():
2756 if os_name not in self.new_os_hvp:
2757 self.new_os_hvp[os_name] = hvs
2758 else:
2759 for hv_name, hv_dict in hvs.items():
2760 if hv_name not in self.new_os_hvp[os_name]:
2761 self.new_os_hvp[os_name][hv_name] = hv_dict
2762 else:
2763 self.new_os_hvp[os_name][hv_name].update(hv_dict)
2764
2765
2766 self.new_osp = objects.FillDict(cluster.osparams, {})
2767 if self.op.osparams:
2768 for os_name, osp in self.op.osparams.items():
2769 if os_name not in self.new_osp:
2770 self.new_osp[os_name] = {}
2771
2772 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
2773 use_none=True)
2774
2775 if not self.new_osp[os_name]:
2776
2777 del self.new_osp[os_name]
2778 else:
2779
2780 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
2781 os_name, self.new_osp[os_name])
2782
2783
2784 if self.op.enabled_hypervisors is not None:
2785 self.hv_list = self.op.enabled_hypervisors
2786 for hv in self.hv_list:
2787
2788
2789
2790
2791
2792 if hv not in new_hvp:
2793 new_hvp[hv] = {}
2794 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2795 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
2796 else:
2797 self.hv_list = cluster.enabled_hypervisors
2798
2799 if self.op.hvparams or self.op.enabled_hypervisors is not None:
2800
2801 for hv_name, hv_params in self.new_hvparams.items():
2802 if ((self.op.hvparams and hv_name in self.op.hvparams) or
2803 (self.op.enabled_hypervisors and
2804 hv_name in self.op.enabled_hypervisors)):
2805
2806 hv_class = hypervisor.GetHypervisor(hv_name)
2807 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2808 hv_class.CheckParameterSyntax(hv_params)
2809 _CheckHVParams(self, node_list, hv_name, hv_params)
2810
2811 if self.op.os_hvp:
2812
2813
2814 for os_name, os_hvp in self.new_os_hvp.items():
2815 for hv_name, hv_params in os_hvp.items():
2816 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2817
2818 cluster_defaults = self.new_hvparams.get(hv_name, {})
2819 new_osp = objects.FillDict(cluster_defaults, hv_params)
2820 hv_class = hypervisor.GetHypervisor(hv_name)
2821 hv_class.CheckParameterSyntax(new_osp)
2822 _CheckHVParams(self, node_list, hv_name, new_osp)
2823
2824 if self.op.default_iallocator:
2825 alloc_script = utils.FindFile(self.op.default_iallocator,
2826 constants.IALLOCATOR_SEARCH_PATH,
2827 os.path.isfile)
2828 if alloc_script is None:
2829 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
2830 " specified" % self.op.default_iallocator,
2831 errors.ECODE_INVAL)
2832
2833 - def Exec(self, feedback_fn):
2834 """Change the parameters of the cluster.
2835
2836 """
2837 if self.op.vg_name is not None:
2838 new_volume = self.op.vg_name
2839 if not new_volume:
2840 new_volume = None
2841 if new_volume != self.cfg.GetVGName():
2842 self.cfg.SetVGName(new_volume)
2843 else:
2844 feedback_fn("Cluster LVM configuration already in desired"
2845 " state, not changing")
2846 if self.op.drbd_helper is not None:
2847 new_helper = self.op.drbd_helper
2848 if not new_helper:
2849 new_helper = None
2850 if new_helper != self.cfg.GetDRBDHelper():
2851 self.cfg.SetDRBDHelper(new_helper)
2852 else:
2853 feedback_fn("Cluster DRBD helper already in desired state,"
2854 " not changing")
2855 if self.op.hvparams:
2856 self.cluster.hvparams = self.new_hvparams
2857 if self.op.os_hvp:
2858 self.cluster.os_hvp = self.new_os_hvp
2859 if self.op.enabled_hypervisors is not None:
2860 self.cluster.hvparams = self.new_hvparams
2861 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2862 if self.op.beparams:
2863 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2864 if self.op.nicparams:
2865 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2866 if self.op.osparams:
2867 self.cluster.osparams = self.new_osp
2868
2869 if self.op.candidate_pool_size is not None:
2870 self.cluster.candidate_pool_size = self.op.candidate_pool_size
2871
2872 _AdjustCandidatePool(self, [])
2873
2874 if self.op.maintain_node_health is not None:
2875 self.cluster.maintain_node_health = self.op.maintain_node_health
2876
2877 if self.op.prealloc_wipe_disks is not None:
2878 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
2879
2880 if self.op.add_uids is not None:
2881 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2882
2883 if self.op.remove_uids is not None:
2884 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2885
2886 if self.op.uid_pool is not None:
2887 self.cluster.uid_pool = self.op.uid_pool
2888
2889 if self.op.default_iallocator is not None:
2890 self.cluster.default_iallocator = self.op.default_iallocator
2891
2892 if self.op.reserved_lvs is not None:
2893 self.cluster.reserved_lvs = self.op.reserved_lvs
2894
2895 def helper_os(aname, mods, desc):
2896 desc += " OS list"
2897 lst = getattr(self.cluster, aname)
2898 for key, val in mods:
2899 if key == constants.DDM_ADD:
2900 if val in lst:
2901 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
2902 else:
2903 lst.append(val)
2904 elif key == constants.DDM_REMOVE:
2905 if val in lst:
2906 lst.remove(val)
2907 else:
2908 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
2909 else:
2910 raise errors.ProgrammerError("Invalid modification '%s'" % key)
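         # For example, helper_os("hidden_os", [(constants.DDM_ADD, "some-os")],
         # "hidden") would append the placeholder OS name "some-os" to the
         # cluster's hidden OS list unless it is already present.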
2911
2912 if self.op.hidden_os:
2913 helper_os("hidden_os", self.op.hidden_os, "hidden")
2914
2915 if self.op.blacklisted_os:
2916 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
2917
2918 self.cfg.Update(self.cluster, feedback_fn)
2919
2921 - def _UploadHelper(lu, nodes, fname):
2922   """Helper for uploading a file and showing warnings.
2923
2924 """
2925 if os.path.exists(fname):
2926 result = lu.rpc.call_upload_file(nodes, fname)
2927 for to_node, to_result in result.items():
2928 msg = to_result.fail_msg
2929 if msg:
2930 msg = ("Copy of file %s to node %s failed: %s" %
2931 (fname, to_node, msg))
2932 lu.proc.LogWarning(msg)
2933
2935 - def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
2936   """Distribute additional files which are part of the cluster configuration.
2937
2938 ConfigWriter takes care of distributing the config and ssconf files, but
2939 there are more files which should be distributed to all nodes. This function
2940 makes sure those are copied.
2941
2942 @param lu: calling logical unit
2943 @param additional_nodes: list of nodes not in the config to distribute to
2944 @type additional_vm: boolean
2945 @param additional_vm: whether the additional nodes are vm-capable or not
2946
2947 """
2948
2949 myself = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
2950 dist_nodes = lu.cfg.GetOnlineNodeList()
2951 nvm_nodes = lu.cfg.GetNonVmCapableNodeList()
2952 vm_nodes = [name for name in dist_nodes if name not in nvm_nodes]
2953 if additional_nodes is not None:
2954 dist_nodes.extend(additional_nodes)
2955 if additional_vm:
2956 vm_nodes.extend(additional_nodes)
2957 if myself.name in dist_nodes:
2958 dist_nodes.remove(myself.name)
2959 if myself.name in vm_nodes:
2960 vm_nodes.remove(myself.name)
2961
2962
2963 dist_files = set([constants.ETC_HOSTS,
2964 constants.SSH_KNOWN_HOSTS_FILE,
2965 constants.RAPI_CERT_FILE,
2966 constants.RAPI_USERS_FILE,
2967 constants.CONFD_HMAC_KEY,
2968 constants.CLUSTER_DOMAIN_SECRET_FILE,
2969 ])
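       # The files above are pushed to every online node; the hypervisor
       # ancillary files collected below go only to vm-capable nodes.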
2970
2971 vm_files = set()
2972 enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
2973 for hv_name in enabled_hypervisors:
2974 hv_class = hypervisor.GetHypervisor(hv_name)
2975 vm_files.update(hv_class.GetAncillaryFiles())
2976
2977
2978 for fname in dist_files:
2979 _UploadHelper(lu, dist_nodes, fname)
2980 for fname in vm_files:
2981 _UploadHelper(lu, vm_nodes, fname)
2982
2985 """Force the redistribution of cluster configuration.
2986
2987 This is a very simple LU.
2988
2989 """
2990 REQ_BGL = False
2991
2997
2998 - def Exec(self, feedback_fn):
3004