1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 """Cluster related commands"""
31
32
33
34
35
36
37
38 from cStringIO import StringIO
39 import os
40 import time
41 import OpenSSL
42 import tempfile
43 import itertools
44
45 from ganeti.cli import *
46 from ganeti import opcodes
47 from ganeti import constants
48 from ganeti import errors
49 from ganeti import utils
50 from ganeti import bootstrap
51 from ganeti import ssh
52 from ganeti import objects
53 from ganeti import uidpool
54 from ganeti import compat
55 from ganeti import netutils
56 from ganeti import ssconf
57 from ganeti import pathutils
58 from ganeti import serializer
59 from ganeti import qlang
60
61
# Command line options declared at module level.  The commands that accept
# them are wired up outside of this excerpt.

# Boolean flag, stored as "opts.on"
ON_OPT = cli_option("--on", dest="on", action="store_true", default=False,
                    help="Recover from an EPO")

# Boolean flag, stored as "opts.groups"
GROUPS_OPT = cli_option("--groups", dest="groups", action="store_true",
                        default=False,
                        help="Arguments are node groups instead of nodes")

# FORCE_FAILOVER and FORCE_DISTRIBUTION share the same flag name and the same
# destination ("opts.yes_do_it"); only the help text differs.
FORCE_FAILOVER = cli_option("--yes-do-it", dest="yes_do_it",
                            action="store_true", default=False,
                            help="Override interactive check for --no-voting")

FORCE_DISTRIBUTION = cli_option("--yes-do-it", dest="yes_do_it",
                                action="store_true", default=False,
                                help="Unconditionally distribute the"
                                " configuration, even if the queue"
                                " is drained")

# String option, no default
TO_OPT = cli_option("--to", type="string", default=None,
                    help="The Ganeti version to upgrade to")

# Boolean flag
RESUME_OPT = cli_option("--resume", action="store_true", default=False,
                        help="Resume any pending Ganeti upgrades")

# EPO ping/reachability tunables.
# NOTE(review): the "15 * 60" spelling suggests these are seconds -- confirm
# against the code that consumes them.
_EPO_PING_INTERVAL = 30
_EPO_PING_TIMEOUT = 1
_EPO_REACHABLE_TIMEOUT = 15 * 60
92 """Checks if the legacy option '--no-lvm-storage' is used.
93
94 """
95 if not opts.lvm_storage:
96 ToStderr("The option --no-lvm-storage is no longer supported. If you want"
97 " to disable lvm-based storage cluster-wide, use the option"
98 " --enabled-disk-templates to disable all of these lvm-base disk "
99 " templates: %s" %
100 utils.CommaJoin(constants.DTS_LVM))
101 return 1
102
105 """Initialize the list of enabled disk templates.
106
107 """
108 if opts.enabled_disk_templates:
109 return opts.enabled_disk_templates.split(",")
110 else:
111 return constants.DEFAULT_ENABLED_DISK_TEMPLATES
112
def _InitVgName(opts, enabled_disk_templates):
  """Initialize the volume group name.

  @param opts: the command line options selected by the user; only
      C{opts.vg_name} is read
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the volume group name to use; C{constants.DEFAULT_VG} when the
      option was not given but LVM is enabled, None when it was not given
      and LVM is not enabled, otherwise the value given by the user
  @raise errors.OpPrereqError: if the volume group was explicitly unset
      while an LVM-based disk template is enabled

  """
  # Every path of the decision below needs this, so compute it once
  lvm_enabled = utils.IsLvmEnabled(enabled_disk_templates)
  vg_name = opts.vg_name

  if vg_name is None:
    # Option not given at all: only fall back to the default if LVM is used
    if lvm_enabled:
      return constants.DEFAULT_VG
    return None

  if not vg_name:
    # Option given, but with an empty value
    if lvm_enabled:
      raise errors.OpPrereqError(
        "LVM disk templates are enabled, but vg name not set.")
    return vg_name

  if not lvm_enabled:
    # Harmless but probably unintended, hence only a notice
    ToStdout("You specified a volume group with --vg-name, but you did not"
             " enable any disk template that uses lvm.")
  return vg_name
134
def _InitDrbdHelper(opts, enabled_disk_templates):
  """Initialize the DRBD usermode helper.

  @param opts: the command line options selected by the user; only
      C{opts.drbd_helper} is read
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: C{constants.DEFAULT_DRBD_HELPER} if DRBD is enabled and no helper
      was given, otherwise C{opts.drbd_helper} unchanged
  @raise errors.OpPrereqError: if DRBD is enabled and the helper was
      explicitly set to the empty string

  """
  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
  helper = opts.drbd_helper

  if not drbd_enabled:
    if helper is not None:
      ToStdout("Note: You specified a DRBD usermode helper, while DRBD storage"
               " is not enabled.")
    return helper

  if helper is None:
    return constants.DEFAULT_DRBD_HELPER

  if helper == "":
    raise errors.OpPrereqError(
      "Unsetting the drbd usermode helper while enabling DRBD is not"
      " allowed.")

  return helper
155
159 """Initialize the cluster.
160
161 @param opts: the command line options selected by the user
162 @type args: list
163 @param args: should contain only one element, the desired
164 cluster name
165 @rtype: int
166 @return: the desired exit code
167
168 """
169 if _CheckNoLvmStorageOptDeprecated(opts):
170 return 1
171
172 enabled_disk_templates = _InitEnabledDiskTemplates(opts)
173
174 try:
175 vg_name = _InitVgName(opts, enabled_disk_templates)
176 drbd_helper = _InitDrbdHelper(opts, enabled_disk_templates)
177 except errors.OpPrereqError, e:
178 ToStderr(str(e))
179 return 1
180
181 master_netdev = opts.master_netdev
182 if master_netdev is None:
183 nic_mode = opts.nicparams.get(constants.NIC_MODE, None)
184 if not nic_mode:
185
186 master_netdev = constants.DEFAULT_BRIDGE
187 elif nic_mode == constants.NIC_MODE_OVS:
188
189 master_netdev = constants.DEFAULT_OVS
190 opts.nicparams[constants.NIC_LINK] = constants.DEFAULT_OVS
191
192 hvlist = opts.enabled_hypervisors
193 if hvlist is None:
194 hvlist = constants.DEFAULT_ENABLED_HYPERVISOR
195 hvlist = hvlist.split(",")
196
197 hvparams = dict(opts.hvparams)
198 beparams = opts.beparams
199 nicparams = opts.nicparams
200
201 diskparams = dict(opts.diskparams)
202
203
204
205 diskparams_keys = set(diskparams.keys())
206 if not (diskparams_keys <= constants.DISK_TEMPLATES):
207 unknown = utils.NiceSort(diskparams_keys - constants.DISK_TEMPLATES)
208 ToStderr("Disk templates unknown: %s" % utils.CommaJoin(unknown))
209 return 1
210
211
212 beparams = objects.FillDict(constants.BEC_DEFAULTS, beparams)
213 utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)
214
215
216 nicparams = objects.FillDict(constants.NICC_DEFAULTS, nicparams)
217 utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
218
219
220 if opts.ndparams is None:
221 ndparams = dict(constants.NDC_DEFAULTS)
222 else:
223 ndparams = objects.FillDict(constants.NDC_DEFAULTS, opts.ndparams)
224 utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
225
226
227 for hv in constants.HYPER_TYPES:
228 if hv not in hvparams:
229 hvparams[hv] = {}
230 hvparams[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], hvparams[hv])
231 utils.ForceDictType(hvparams[hv], constants.HVS_PARAMETER_TYPES)
232
233
234 for templ in constants.DISK_TEMPLATES:
235 if templ not in diskparams:
236 diskparams[templ] = {}
237 diskparams[templ] = objects.FillDict(constants.DISK_DT_DEFAULTS[templ],
238 diskparams[templ])
239 utils.ForceDictType(diskparams[templ], constants.DISK_DT_TYPES)
240
241
242 ipolicy = CreateIPolicyFromOpts(
243 ispecs_mem_size=opts.ispecs_mem_size,
244 ispecs_cpu_count=opts.ispecs_cpu_count,
245 ispecs_disk_count=opts.ispecs_disk_count,
246 ispecs_disk_size=opts.ispecs_disk_size,
247 ispecs_nic_count=opts.ispecs_nic_count,
248 minmax_ispecs=opts.ipolicy_bounds_specs,
249 std_ispecs=opts.ipolicy_std_specs,
250 ipolicy_disk_templates=opts.ipolicy_disk_templates,
251 ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
252 ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
253 fill_all=True)
254
255 if opts.candidate_pool_size is None:
256 opts.candidate_pool_size = constants.MASTER_POOL_SIZE_DEFAULT
257
258 if opts.mac_prefix is None:
259 opts.mac_prefix = constants.DEFAULT_MAC_PREFIX
260
261 uid_pool = opts.uid_pool
262 if uid_pool is not None:
263 uid_pool = uidpool.ParseUidPool(uid_pool)
264
265 if opts.prealloc_wipe_disks is None:
266 opts.prealloc_wipe_disks = False
267
268 external_ip_setup_script = opts.use_external_mip_script
269 if external_ip_setup_script is None:
270 external_ip_setup_script = False
271
272 try:
273 primary_ip_version = int(opts.primary_ip_version)
274 except (ValueError, TypeError), err:
275 ToStderr("Invalid primary ip version value: %s" % str(err))
276 return 1
277
278 master_netmask = opts.master_netmask
279 try:
280 if master_netmask is not None:
281 master_netmask = int(master_netmask)
282 except (ValueError, TypeError), err:
283 ToStderr("Invalid master netmask value: %s" % str(err))
284 return 1
285
286 if opts.disk_state:
287 disk_state = utils.FlatToDict(opts.disk_state)
288 else:
289 disk_state = {}
290
291 hv_state = dict(opts.hv_state)
292
293 bootstrap.InitCluster(cluster_name=args[0],
294 secondary_ip=opts.secondary_ip,
295 vg_name=vg_name,
296 mac_prefix=opts.mac_prefix,
297 master_netmask=master_netmask,
298 master_netdev=master_netdev,
299 file_storage_dir=opts.file_storage_dir,
300 shared_file_storage_dir=opts.shared_file_storage_dir,
301 enabled_hypervisors=hvlist,
302 hvparams=hvparams,
303 beparams=beparams,
304 nicparams=nicparams,
305 ndparams=ndparams,
306 diskparams=diskparams,
307 ipolicy=ipolicy,
308 candidate_pool_size=opts.candidate_pool_size,
309 modify_etc_hosts=opts.modify_etc_hosts,
310 modify_ssh_setup=opts.modify_ssh_setup,
311 maintain_node_health=opts.maintain_node_health,
312 drbd_helper=drbd_helper,
313 uid_pool=uid_pool,
314 default_iallocator=opts.default_iallocator,
315 primary_ip_version=primary_ip_version,
316 prealloc_wipe_disks=opts.prealloc_wipe_disks,
317 use_external_mip_script=external_ip_setup_script,
318 hv_state=hv_state,
319 disk_state=disk_state,
320 enabled_disk_templates=enabled_disk_templates,
321 )
322 op = opcodes.OpClusterPostInit()
323 SubmitOpCode(op, opts=opts)
324 return 0
325
329 """Destroy the cluster.
330
331 @param opts: the command line options selected by the user
332 @type args: list
333 @param args: should be an empty list
334 @rtype: int
335 @return: the desired exit code
336
337 """
338 if not opts.yes_do_it:
339 ToStderr("Destroying a cluster is irreversible. If you really want"
340 " destroy this cluster, supply the --yes-do-it option.")
341 return 1
342
343 op = opcodes.OpClusterDestroy()
344 master_uuid = SubmitOpCode(op, opts=opts)
345
346
347 bootstrap.FinalizeClusterDestroy(master_uuid)
348 return 0
349
352 """Rename the cluster.
353
354 @param opts: the command line options selected by the user
355 @type args: list
356 @param args: should contain only one element, the new cluster name
357 @rtype: int
358 @return: the desired exit code
359
360 """
361 cl = GetClient()
362
363 (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
364
365 new_name = args[0]
366 if not opts.force:
367 usertext = ("This will rename the cluster from '%s' to '%s'. If you are"
368 " connected over the network to the cluster name, the"
369 " operation is very dangerous as the IP address will be"
370 " removed from the node and the change may not go through."
371 " Continue?") % (cluster_name, new_name)
372 if not AskUser(usertext):
373 return 1
374
375 op = opcodes.OpClusterRename(name=new_name)
376 result = SubmitOpCode(op, opts=opts, cl=cl)
377
378 if result:
379 ToStdout("Cluster renamed from '%s' to '%s'", cluster_name, result)
380
381 return 0
382
391
394 """Deactivates the master IP.
395
396 """
397 if not opts.confirm:
398 usertext = ("This will disable the master IP. All the open connections to"
399 " the master IP will be closed. To reach the master you will"
400 " need to use its node IP."
401 " Continue?")
402 if not AskUser(usertext):
403 return 1
404
405 op = opcodes.OpClusterDeactivateMasterIp()
406 SubmitOpCode(op)
407 return 0
408
411 """Forces push of the cluster configuration.
412
413 @param opts: the command line options selected by the user
414 @type args: list
415 @param args: empty list
416 @rtype: int
417 @return: the desired exit code
418
419 """
420 op = opcodes.OpClusterRedistConf()
421 if opts.yes_do_it:
422 SubmitOpCodeToDrainedQueue(op)
423 else:
424 SubmitOrSend(op, opts)
425 return 0
426
429 """Write version of ganeti software to the standard output.
430
431 @param opts: the command line options selected by the user
432 @type args: list
433 @param args: should be an empty list
434 @rtype: int
435 @return: the desired exit code
436
437 """
438 cl = GetClient(query=True)
439 result = cl.QueryClusterInfo()
440 ToStdout("Software version: %s", result["software_version"])
441 ToStdout("Internode protocol: %s", result["protocol_version"])
442 ToStdout("Configuration format: %s", result["config_version"])
443 ToStdout("OS api version: %s", result["os_api_version"])
444 ToStdout("Export interface: %s", result["export_version"])
445 ToStdout("VCS version: %s", result["vcs_version"])
446 return 0
447
450 """Write name of master node to the standard output.
451
452 @param opts: the command line options selected by the user
453 @type args: list
454 @param args: should be an empty list
455 @rtype: int
456 @return: the desired exit code
457
458 """
459 master = bootstrap.GetMaster()
460 ToStdout(master)
461 return 0
462
482
485 """Shows cluster information.
486
487 @param opts: the command line options selected by the user
488 @type args: list
489 @param args: should be an empty list
490 @rtype: int
491 @return: the desired exit code
492
493 """
494 cl = GetClient(query=True)
495 result = cl.QueryClusterInfo()
496
497 if result["tags"]:
498 tags = utils.CommaJoin(utils.NiceSort(result["tags"]))
499 else:
500 tags = "(none)"
501 if result["reserved_lvs"]:
502 reserved_lvs = utils.CommaJoin(result["reserved_lvs"])
503 else:
504 reserved_lvs = "(none)"
505
506 enabled_hv = result["enabled_hypervisors"]
507 hvparams = dict((k, v) for k, v in result["hvparams"].iteritems()
508 if k in enabled_hv)
509
510 info = [
511 ("Cluster name", result["name"]),
512 ("Cluster UUID", result["uuid"]),
513
514 ("Creation time", utils.FormatTime(result["ctime"])),
515 ("Modification time", utils.FormatTime(result["mtime"])),
516
517 ("Master node", result["master"]),
518
519 ("Architecture (this node)",
520 "%s (%s)" % (result["architecture"][0], result["architecture"][1])),
521
522 ("Tags", tags),
523
524 ("Default hypervisor", result["default_hypervisor"]),
525 ("Enabled hypervisors", utils.CommaJoin(enabled_hv)),
526
527 ("Hypervisor parameters", _FormatGroupedParams(hvparams)),
528
529 ("OS-specific hypervisor parameters",
530 _FormatGroupedParams(result["os_hvp"])),
531
532 ("OS parameters", _FormatGroupedParams(result["osparams"])),
533
534 ("Hidden OSes", utils.CommaJoin(result["hidden_os"])),
535 ("Blacklisted OSes", utils.CommaJoin(result["blacklisted_os"])),
536
537 ("Cluster parameters", [
538 ("candidate pool size",
539 compat.TryToRoman(result["candidate_pool_size"],
540 convert=opts.roman_integers)),
541 ("master netdev", result["master_netdev"]),
542 ("master netmask", result["master_netmask"]),
543 ("use external master IP address setup script",
544 result["use_external_mip_script"]),
545 ("lvm volume group", result["volume_group_name"]),
546 ("lvm reserved volumes", reserved_lvs),
547 ("drbd usermode helper", result["drbd_usermode_helper"]),
548 ("file storage path", result["file_storage_dir"]),
549 ("shared file storage path", result["shared_file_storage_dir"]),
550 ("maintenance of node health", result["maintain_node_health"]),
551 ("uid pool", uidpool.FormatUidPool(result["uid_pool"])),
552 ("default instance allocator", result["default_iallocator"]),
553 ("primary ip version", result["primary_ip_version"]),
554 ("preallocation wipe disks", result["prealloc_wipe_disks"]),
555 ("OS search path", utils.CommaJoin(pathutils.OS_SEARCH_PATH)),
556 ("ExtStorage Providers search path",
557 utils.CommaJoin(pathutils.ES_SEARCH_PATH)),
558 ("enabled disk templates",
559 utils.CommaJoin(result["enabled_disk_templates"])),
560 ]),
561
562 ("Default node parameters",
563 _FormatGroupedParams(result["ndparams"], roman=opts.roman_integers)),
564
565 ("Default instance parameters",
566 _FormatGroupedParams(result["beparams"], roman=opts.roman_integers)),
567
568 ("Default nic parameters",
569 _FormatGroupedParams(result["nicparams"], roman=opts.roman_integers)),
570
571 ("Default disk parameters",
572 _FormatGroupedParams(result["diskparams"], roman=opts.roman_integers)),
573
574 ("Instance policy - limits for instances",
575 FormatPolicyInfo(result["ipolicy"], None, True)),
576 ]
577
578 PrintGenericInfo(info)
579 return 0
580
583 """Copy a file from master to some nodes.
584
585 @param opts: the command line options selected by the user
586 @type args: list
587 @param args: should contain only one element, the path of
588 the file to be copied
589 @rtype: int
590 @return: the desired exit code
591
592 """
593 filename = args[0]
594 filename = os.path.abspath(filename)
595
596 if not os.path.exists(filename):
597 raise errors.OpPrereqError("No such filename '%s'" % filename,
598 errors.ECODE_INVAL)
599
600 cl = GetClient()
601
602 cluster_name = cl.QueryConfigValues(["cluster_name"])[0]
603
604 results = GetOnlineNodes(nodes=opts.nodes, cl=cl, filter_master=True,
605 secondary_ips=opts.use_replication_network,
606 nodegroup=opts.nodegroup)
607
608 srun = ssh.SshRunner(cluster_name)
609 for node in results:
610 if not srun.CopyFileToNode(node, filename):
611 ToStderr("Copy of file %s to node %s failed", filename, node)
612
613 return 0
614
617 """Run a command on some nodes.
618
619 @param opts: the command line options selected by the user
620 @type args: list
621 @param args: should contain the command to be run and its arguments
622 @rtype: int
623 @return: the desired exit code
624
625 """
626 cl = GetClient()
627
628 command = " ".join(args)
629
630 nodes = GetOnlineNodes(nodes=opts.nodes, cl=cl, nodegroup=opts.nodegroup)
631
632 cluster_name, master_node = cl.QueryConfigValues(["cluster_name",
633 "master_node"])
634
635 srun = ssh.SshRunner(cluster_name=cluster_name)
636
637
638 if master_node in nodes:
639 nodes.remove(master_node)
640 nodes.append(master_node)
641
642 for name in nodes:
643 result = srun.Run(name, constants.SSH_LOGIN_USER, command)
644
645 if opts.failure_only and result.exit_code == constants.EXIT_SUCCESS:
646
647 continue
648
649 ToStdout("------------------------------------------------")
650 if opts.show_machine_names:
651 for line in result.output.splitlines():
652 ToStdout("%s: %s", name, line)
653 else:
654 ToStdout("node: %s", name)
655 ToStdout("%s", result.output)
656 ToStdout("return code = %s", result.exit_code)
657
658 return 0
659
662 """Verify integrity of cluster, performing various test on nodes.
663
664 @param opts: the command line options selected by the user
665 @type args: list
666 @param args: should be an empty list
667 @rtype: int
668 @return: the desired exit code
669
670 """
671 skip_checks = []
672
673 if opts.skip_nplusone_mem:
674 skip_checks.append(constants.VERIFY_NPLUSONE_MEM)
675
676 cl = GetClient()
677
678 op = opcodes.OpClusterVerify(verbose=opts.verbose,
679 error_codes=opts.error_codes,
680 debug_simulate_errors=opts.simulate_errors,
681 skip_checks=skip_checks,
682 ignore_errors=opts.ignore_errors,
683 group_name=opts.nodegroup)
684 result = SubmitOpCode(op, cl=cl, opts=opts)
685
686
687 jex = JobExecutor(cl=cl, opts=opts)
688
689 for (status, job_id) in result[constants.JOB_IDS_KEY]:
690 jex.AddJobId(None, status, job_id)
691
692 results = jex.GetResults()
693
694 (bad_jobs, bad_results) = \
695 map(len,
696
697 map(list,
698
699 map(compat.partial(itertools.ifilterfalse, bool),
700
701 zip(*((job_success, len(op_results) == 1 and op_results[0])
702 for (job_success, op_results) in results)))))
703
704 if bad_jobs == 0 and bad_results == 0:
705 rcode = constants.EXIT_SUCCESS
706 else:
707 rcode = constants.EXIT_FAILURE
708 if bad_jobs > 0:
709 ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs)
710
711 return rcode
712
715 """Verify integrity of cluster disks.
716
717 @param opts: the command line options selected by the user
718 @type args: list
719 @param args: should be an empty list
720 @rtype: int
721 @return: the desired exit code
722
723 """
724 cl = GetClient()
725
726 op = opcodes.OpClusterVerifyDisks()
727
728 result = SubmitOpCode(op, cl=cl, opts=opts)
729
730
731 jex = JobExecutor(cl=cl, opts=opts)
732
733 for (status, job_id) in result[constants.JOB_IDS_KEY]:
734 jex.AddJobId(None, status, job_id)
735
736 retcode = constants.EXIT_SUCCESS
737
738 for (status, result) in jex.GetResults():
739 if not status:
740 ToStdout("Job failed: %s", result)
741 continue
742
743 ((bad_nodes, instances, missing), ) = result
744
745 for node, text in bad_nodes.items():
746 ToStdout("Error gathering data on node %s: %s",
747 node, utils.SafeEncode(text[-400:]))
748 retcode = constants.EXIT_FAILURE
749 ToStdout("You need to fix these nodes first before fixing instances")
750
751 for iname in instances:
752 if iname in missing:
753 continue
754 op = opcodes.OpInstanceActivateDisks(instance_name=iname)
755 try:
756 ToStdout("Activating disks for instance '%s'", iname)
757 SubmitOpCode(op, opts=opts, cl=cl)
758 except errors.GenericError, err:
759 nret, msg = FormatError(err)
760 retcode |= nret
761 ToStderr("Error activating disks for instance %s: %s", iname, msg)
762
763 if missing:
764 for iname, ival in missing.iteritems():
765 all_missing = compat.all(x[0] in bad_nodes for x in ival)
766 if all_missing:
767 ToStdout("Instance %s cannot be verified as it lives on"
768 " broken nodes", iname)
769 else:
770 ToStdout("Instance %s has missing logical volumes:", iname)
771 ival.sort()
772 for node, vol in ival:
773 if node in bad_nodes:
774 ToStdout("\tbroken node %s /dev/%s", node, vol)
775 else:
776 ToStdout("\t%s /dev/%s", node, vol)
777
778 ToStdout("You need to replace or recreate disks for all the above"
779 " instances if this message persists after fixing broken nodes.")
780 retcode = constants.EXIT_FAILURE
781 elif not instances:
782 ToStdout("No disks need to be activated.")
783
784 return retcode
785
788 """Verify sizes of cluster disks.
789
790 @param opts: the command line options selected by the user
791 @type args: list
792 @param args: optional list of instances to restrict check to
793 @rtype: int
794 @return: the desired exit code
795
796 """
797 op = opcodes.OpClusterRepairDiskSizes(instances=args)
798 SubmitOpCode(op, opts=opts)
799
803 """Failover the master node.
804
805 This command, when run on a non-master node, will cause the current
806 master to cease being master, and the non-master to become new
807 master.
808
809 @param opts: the command line options selected by the user
810 @type args: list
811 @param args: should be an empty list
812 @rtype: int
813 @return: the desired exit code
814
815 """
816 if opts.no_voting and not opts.yes_do_it:
817 usertext = ("This will perform the failover even if most other nodes"
818 " are down, or if this node is outdated. This is dangerous"
819 " as it can lead to a non-consistent cluster. Check the"
820 " gnt-cluster(8) man page before proceeding. Continue?")
821 if not AskUser(usertext):
822 return 1
823
824 rvlaue, msgs = bootstrap.MasterFailover(no_voting=opts.no_voting)
825 for msg in msgs:
826 ToStderr(msg)
827 return rvlaue
828
831 """Checks if the master is alive.
832
833 @param opts: the command line options selected by the user
834 @type args: list
835 @param args: should be an empty list
836 @rtype: int
837 @return: the desired exit code
838
839 """
840 try:
841 cl = GetClient()
842 cl.QueryClusterInfo()
843 return 0
844 except Exception:
845 return 1
846
866
# NOTE(review): the "def" line of this function is not part of the visible
# source; name and parameters were restored from its call sites and from the
# docstring.  The default of "verify_private_key" is assumed to be False
# (one call site omits the argument) -- confirm.
def _ReadAndVerifyCert(cert_filename, verify_private_key=False):
  """Reads and verifies an X509 certificate.

  Verification here means that the file content can be loaded as a
  PEM-encoded certificate (and, optionally, as a PEM-encoded private key).

  @type cert_filename: string
  @param cert_filename: the path of the file containing the certificate to
                        verify encoded in PEM format
  @type verify_private_key: bool
  @param verify_private_key: whether to verify the private key in addition to
                             the public certificate
  @rtype: string
  @return: a string containing the PEM-encoded certificate.
  @raise errors.X509CertError: with arguments (filename, message) if the file
      cannot be read or its content cannot be loaded

  """
  try:
    pem = utils.ReadFile(cert_filename)
  except IOError as err:
    raise errors.X509CertError(cert_filename,
                               "Unable to read certificate: %s" % str(err))

  # Any loader failure is converted into X509CertError, hence the broad catch
  try:
    OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, pem)
  except Exception as err:
    raise errors.X509CertError(cert_filename,
                               "Unable to load certificate: %s" % str(err))

  if verify_private_key:
    # The key is looked for in the same PEM data as the certificate
    try:
      OpenSSL.crypto.load_privatekey(OpenSSL.crypto.FILETYPE_PEM, pem)
    except Exception as err:
      raise errors.X509CertError(cert_filename,
                                 "Unable to load private key: %s" % str(err))

  return pem
901
902
def _RenewCrypto(new_cluster_cert, new_rapi_cert,
                 rapi_cert_filename, new_spice_cert, spice_cert_filename,
                 spice_cacert_filename, new_confd_hmac_key, new_cds,
                 cds_filename, force):
  """Renews cluster certificates, keys and secrets.

  The option combination is validated first, then user-supplied files are
  read, and only after that is the user asked for confirmation.  The actual
  renewal and the copy to the other nodes run via L{RunWhileClusterStopped}.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type rapi_cert_filename: string
  @param rapi_cert_filename: Path to file containing new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type spice_cert_filename: string
  @param spice_cert_filename: Path to file containing new SPICE certificate
  @type spice_cacert_filename: string
  @param spice_cacert_filename: Path to file containing the certificate of the
                                CA that signed the SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type cds_filename: string
  @param cds_filename: Path to file containing new cluster domain secret
  @type force: bool
  @param force: If true, the user is not asked for confirmation
  @rtype: int
  @return: 0 on success; 1 if the options conflict, an input file cannot be
      loaded or the user declines the confirmation

  """
  # Step 1: reject contradictory option combinations
  if new_rapi_cert and rapi_cert_filename:
    ToStderr("Only one of the --new-rapi-certificate and --rapi-certificate"
             " options can be specified at the same time.")
    return 1

  if new_cds and cds_filename:
    ToStderr("Only one of the --new-cluster-domain-secret and"
             " --cluster-domain-secret options can be specified at"
             " the same time.")
    return 1

  if new_spice_cert and (spice_cert_filename or spice_cacert_filename):
    ToStderr("When using --new-spice-certificate, the --spice-certificate"
             " and --spice-ca-certificate must not be used.")
    return 1

  # Exactly one of the two SPICE files given is an error
  if bool(spice_cacert_filename) ^ bool(spice_cert_filename):
    ToStderr("Both --spice-certificate and --spice-ca-certificate must be"
             " specified.")
    return 1

  # Step 2: load the user-supplied material
  rapi_cert_pem = spice_cert_pem = spice_cacert_pem = None
  try:
    if rapi_cert_filename:
      rapi_cert_pem = _ReadAndVerifyCert(rapi_cert_filename, True)
    if spice_cert_filename:
      spice_cert_pem = _ReadAndVerifyCert(spice_cert_filename, True)
      spice_cacert_pem = _ReadAndVerifyCert(spice_cacert_filename)
  except errors.X509CertError as err:
    # The exception carries (filename, message) as its arguments
    ToStderr("Unable to load X509 certificate from %s: %s",
             err.args[0], err.args[1])
    return 1

  cds = None
  if cds_filename:
    # Any failure to read the secret is reported the same way
    try:
      cds = utils.ReadFile(cds_filename)
    except Exception as err:
      ToStderr("Can't load new cluster domain secret from %s: %s" %
               (cds_filename, str(err)))
      return 1

  # Step 3: last chance to bail out
  if not force:
    usertext = ("This requires all daemons on all nodes to be restarted and"
                " may take some time. Continue?")
    if not AskUser(usertext):
      return 1

  def _RenewCryptoInner(ctx):
    """Regenerates the requested items and copies them to the other nodes.

    @param ctx: helper object passed in by L{RunWhileClusterStopped}; its
        C{feedback_fn}, C{nonmaster_nodes} and C{ssh} attributes are used

    """
    ctx.feedback_fn("Updating certificates and keys")
    bootstrap.GenerateClusterCrypto(new_cluster_cert,
                                    new_rapi_cert,
                                    new_spice_cert,
                                    new_confd_hmac_key,
                                    new_cds,
                                    rapi_cert_pem=rapi_cert_pem,
                                    spice_cert_pem=spice_cert_pem,
                                    spice_cacert_pem=spice_cacert_pem,
                                    cds=cds)

    # (was this item renewed or replaced?, files that then need distribution)
    copy_plan = [
      (new_cluster_cert, [pathutils.NODED_CERT_FILE]),
      (new_rapi_cert or rapi_cert_pem, [pathutils.RAPI_CERT_FILE]),
      (new_spice_cert or spice_cert_pem,
       [pathutils.SPICE_CERT_FILE, pathutils.SPICE_CACERT_FILE]),
      (new_confd_hmac_key, [pathutils.CONFD_HMAC_KEY]),
      (new_cds or cds, [pathutils.CLUSTER_DOMAIN_SECRET_FILE]),
      ]

    files_to_copy = []
    for (selected, paths) in copy_plan:
      if selected:
        files_to_copy.extend(paths)

    if not files_to_copy:
      return

    for node_name in ctx.nonmaster_nodes:
      ctx.feedback_fn("Copying %s to %s" %
                      (", ".join(files_to_copy), node_name))
      for file_name in files_to_copy:
        # NOTE(review): the result of the copy is not checked here, unlike
        # in the plain file-copy command of this module -- confirm whether
        # a failed copy should be reported.
        ctx.ssh.CopyFileToNode(node_name, file_name)

  RunWhileClusterStopped(ToStdout, _RenewCryptoInner)

  ToStdout("All requested certificates and keys have been replaced."
           " Running \"gnt-cluster verify\" now is recommended.")

  return 0
1023
1026 """Renews cluster certificates, keys and secrets.
1027
1028 """
1029 return _RenewCrypto(opts.new_cluster_cert,
1030 opts.new_rapi_cert,
1031 opts.rapi_cert,
1032 opts.new_spice_cert,
1033 opts.spice_cert,
1034 opts.spice_cacert,
1035 opts.new_confd_hmac_key,
1036 opts.new_cluster_domain_secret,
1037 opts.cluster_domain_secret,
1038 opts.force)
1039
1042 """Determine the list of enabled disk templates.
1043
1044 """
1045 if opts.enabled_disk_templates:
1046 return opts.enabled_disk_templates.split(",")
1047 else:
1048 return None
1049
1050
1051 -def _GetVgName(opts, enabled_disk_templates):
1052 """Determine the volume group name.
1053
1054 @type enabled_disk_templates: list of strings
1055 @param enabled_disk_templates: cluster-wide enabled disk-templates
1056
1057 """
1058
1059 vg_name = None
1060 if opts.vg_name is not None:
1061 vg_name = opts.vg_name
1062 if enabled_disk_templates:
1063 if vg_name and not utils.IsLvmEnabled(enabled_disk_templates):
1064 ToStdout("You specified a volume group with --vg-name, but you did not"
1065 " enable any of the following lvm-based disk templates: %s" %
1066 utils.CommaJoin(constants.DTS_LVM))
1067 return vg_name
1068
1071 """Determine the DRBD usermode helper.
1072
1073 """
1074 drbd_helper = opts.drbd_helper
1075 if enabled_disk_templates:
1076 drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
1077 if not drbd_enabled and opts.drbd_helper:
1078 ToStdout("You specified a DRBD usermode helper with "
1079 " --drbd-usermode-helper while DRBD is not enabled.")
1080 return drbd_helper
1081
1084 """Modify the cluster.
1085
1086 @param opts: the command line options selected by the user
1087 @type args: list
1088 @param args: should be an empty list
1089 @rtype: int
1090 @return: the desired exit code
1091
1092 """
1093 if not (opts.vg_name is not None or
1094 opts.drbd_helper is not None or
1095 opts.enabled_hypervisors or opts.hvparams or
1096 opts.beparams or opts.nicparams or
1097 opts.ndparams or opts.diskparams or
1098 opts.candidate_pool_size is not None or
1099 opts.uid_pool is not None or
1100 opts.maintain_node_health is not None or
1101 opts.add_uids is not None or
1102 opts.remove_uids is not None or
1103 opts.default_iallocator is not None or
1104 opts.reserved_lvs is not None or
1105 opts.master_netdev is not None or
1106 opts.master_netmask is not None or
1107 opts.use_external_mip_script is not None or
1108 opts.prealloc_wipe_disks is not None or
1109 opts.hv_state or
1110 opts.enabled_disk_templates or
1111 opts.disk_state or
1112 opts.ipolicy_bounds_specs is not None or
1113 opts.ipolicy_std_specs is not None or
1114 opts.ipolicy_disk_templates is not None or
1115 opts.ipolicy_vcpu_ratio is not None or
1116 opts.ipolicy_spindle_ratio is not None or
1117 opts.modify_etc_hosts is not None or
1118 opts.file_storage_dir is not None or
1119 opts.shared_file_storage_dir is not None):
1120 ToStderr("Please give at least one of the parameters.")
1121 return 1
1122
1123 if _CheckNoLvmStorageOptDeprecated(opts):
1124 return 1
1125
1126 enabled_disk_templates = _GetEnabledDiskTemplates(opts)
1127 vg_name = _GetVgName(opts, enabled_disk_templates)
1128
1129 try:
1130 drbd_helper = _GetDrbdHelper(opts, enabled_disk_templates)
1131 except errors.OpPrereqError, e:
1132 ToStderr(str(e))
1133 return 1
1134
1135 hvlist = opts.enabled_hypervisors
1136 if hvlist is not None:
1137 hvlist = hvlist.split(",")
1138
1139
1140 hvparams = dict(opts.hvparams)
1141 for hv_params in hvparams.values():
1142 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1143
1144 diskparams = dict(opts.diskparams)
1145
1146 for dt_params in diskparams.values():
1147 utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
1148
1149 beparams = opts.beparams
1150 utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)
1151
1152 nicparams = opts.nicparams
1153 utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
1154
1155 ndparams = opts.ndparams
1156 if ndparams is not None:
1157 utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
1158
1159 ipolicy = CreateIPolicyFromOpts(
1160 minmax_ispecs=opts.ipolicy_bounds_specs,
1161 std_ispecs=opts.ipolicy_std_specs,
1162 ipolicy_disk_templates=opts.ipolicy_disk_templates,
1163 ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
1164 ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
1165 )
1166
1167 mnh = opts.maintain_node_health
1168
1169 uid_pool = opts.uid_pool
1170 if uid_pool is not None:
1171 uid_pool = uidpool.ParseUidPool(uid_pool)
1172
1173 add_uids = opts.add_uids
1174 if add_uids is not None:
1175 add_uids = uidpool.ParseUidPool(add_uids)
1176
1177 remove_uids = opts.remove_uids
1178 if remove_uids is not None:
1179 remove_uids = uidpool.ParseUidPool(remove_uids)
1180
1181 if opts.reserved_lvs is not None:
1182 if opts.reserved_lvs == "":
1183 opts.reserved_lvs = []
1184 else:
1185 opts.reserved_lvs = utils.UnescapeAndSplit(opts.reserved_lvs, sep=",")
1186
1187 if opts.master_netmask is not None:
1188 try:
1189 opts.master_netmask = int(opts.master_netmask)
1190 except ValueError:
1191 ToStderr("The --master-netmask option expects an int parameter.")
1192 return 1
1193
1194 ext_ip_script = opts.use_external_mip_script
1195
1196 if opts.disk_state:
1197 disk_state = utils.FlatToDict(opts.disk_state)
1198 else:
1199 disk_state = {}
1200
1201 hv_state = dict(opts.hv_state)
1202
1203 op = opcodes.OpClusterSetParams(
1204 vg_name=vg_name,
1205 drbd_helper=drbd_helper,
1206 enabled_hypervisors=hvlist,
1207 hvparams=hvparams,
1208 os_hvp=None,
1209 beparams=beparams,
1210 nicparams=nicparams,
1211 ndparams=ndparams,
1212 diskparams=diskparams,
1213 ipolicy=ipolicy,
1214 candidate_pool_size=opts.candidate_pool_size,
1215 maintain_node_health=mnh,
1216 modify_etc_hosts=opts.modify_etc_hosts,
1217 uid_pool=uid_pool,
1218 add_uids=add_uids,
1219 remove_uids=remove_uids,
1220 default_iallocator=opts.default_iallocator,
1221 prealloc_wipe_disks=opts.prealloc_wipe_disks,
1222 master_netdev=opts.master_netdev,
1223 master_netmask=opts.master_netmask,
1224 reserved_lvs=opts.reserved_lvs,
1225 use_external_mip_script=ext_ip_script,
1226 hv_state=hv_state,
1227 disk_state=disk_state,
1228 enabled_disk_templates=enabled_disk_templates,
1229 force=opts.force,
1230 file_storage_dir=opts.file_storage_dir,
1231 shared_file_storage_dir=opts.shared_file_storage_dir,
1232 )
1233 SubmitOrSend(op, opts)
1234 return 0
1235
1238 """Queue operations.
1239
1240 @param opts: the command line options selected by the user
1241 @type args: list
1242 @param args: should contain only one element, the subcommand
1243 @rtype: int
1244 @return: the desired exit code
1245
1246 """
1247 command = args[0]
1248 client = GetClient()
1249 if command in ("drain", "undrain"):
1250 drain_flag = command == "drain"
1251 client.SetQueueDrainFlag(drain_flag)
1252 elif command == "info":
1253 result = client.QueryConfigValues(["drain_flag"])
1254 if result[0]:
1255 val = "set"
1256 else:
1257 val = "unset"
1258 ToStdout("The drain flag is %s" % val)
1259 else:
1260 raise errors.OpPrereqError("Command '%s' is not valid." % command,
1261 errors.ECODE_INVAL)
1262
1263 return 0
1264
# A pause only counts while its end timestamp has not been passed yet
still_paused = until is not None and not until < time.time()
if still_paused:
  ToStdout("The watcher is paused until %s.", time.ctime(until))
else:
  ToStdout("The watcher is not paused.")
1271
1306
1309 """Puts the node in the list to desired power state.
1310
1311 @param opts: The command line options selected by the user
1312 @param node_list: The list of nodes to operate on
1313 @param power: True if they should be powered on, False otherwise
1314 @return: The success of the operation (none failed)
1315
1316 """
1317 if power:
1318 command = constants.OOB_POWER_ON
1319 else:
1320 command = constants.OOB_POWER_OFF
1321
1322 op = opcodes.OpOobCommand(node_names=node_list,
1323 command=command,
1324 ignore_status=True,
1325 timeout=opts.oob_timeout,
1326 power_delay=opts.power_delay)
1327 result = SubmitOpCode(op, opts=opts)
1328 errs = 0
1329 for node_result in result:
1330 (node_tuple, data_tuple) = node_result
1331 (_, node_name) = node_tuple
1332 (data_status, _) = data_tuple
1333 if data_status != constants.RS_NORMAL:
1334 assert data_status != constants.RS_UNAVAIL
1335 errs += 1
1336 ToStderr("There was a problem changing power for %s, please investigate",
1337 node_name)
1338
1339 if errs > 0:
1340 return False
1341
1342 return True
1343
1346 """Puts the instances in the list to desired state.
1347
1348 @param opts: The command line options selected by the user
1349 @param inst_list: The list of instances to operate on
1350 @param start: True if they should be started, False for shutdown
1351 @param no_remember: If the instance state should be remembered
1352 @return: The success of the operation (none failed)
1353
1354 """
1355 if start:
1356 opcls = opcodes.OpInstanceStartup
1357 text_submit, text_success, text_failed = ("startup", "started", "starting")
1358 else:
1359 opcls = compat.partial(opcodes.OpInstanceShutdown,
1360 timeout=opts.shutdown_timeout,
1361 no_remember=no_remember)
1362 text_submit, text_success, text_failed = ("shutdown", "stopped", "stopping")
1363
1364 jex = JobExecutor(opts=opts)
1365
1366 for inst in inst_list:
1367 ToStdout("Submit %s of instance %s", text_submit, inst)
1368 op = opcls(instance_name=inst)
1369 jex.QueueJob(inst, op)
1370
1371 results = jex.GetResults()
1372 bad_cnt = len([1 for (success, _) in results if not success])
1373
1374 if bad_cnt == 0:
1375 ToStdout("All instances have been %s successfully", text_success)
1376 else:
1377 ToStderr("There were errors while %s instances:\n"
1378 "%d error(s) out of %d instance(s)", text_failed, bad_cnt,
1379 len(results))
1380 return False
1381
1382 return True
1383
"""Helper class keeping the state shared between the retry callbacks.

@ivar success: Indicates if all action_cb calls were successful

"""
def __init__(self, node_list, action_cb, node2ip, port, feedback_fn,
             _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
  """Initialises the helper's bookkeeping.

  All nodes start out as down; none is considered up yet.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @type node2ip: dict
  @param node2ip: Node to ip mapping
  @param port: The port to use for the TCP ping
  @param feedback_fn: The function used for feedback
  @param _ping_fn: Function to check reachabilty (for unittest use only)
  @param _sleep_fn: Function to sleep (for unittest use only)

  """
  # Injected collaborators
  self.action_cb = action_cb
  self.feedback_fn = feedback_fn
  self._ping_fn = _ping_fn
  self._sleep_fn = _sleep_fn

  # Where and how to probe the nodes
  self.node2ip = node2ip
  self.port = port

  # Progress tracking
  self.up = set()
  self.down = set(node_list)
  self.success = True
1414
1416 """When called we run action_cb.
1417
1418 @raises utils.RetryAgain: When there are still down nodes
1419
1420 """
1421 if not self.action_cb(self.up):
1422 self.success = False
1423
1424 if self.down:
1425 raise utils.RetryAgain()
1426 else:
1427 return self.success
1428
def Wait(self, secs):
  """Pings the down nodes; sleeps the remaining time if none answered.

  @param secs: The secs remaining

  """
  began = time.time()
  for name in self.down:
    reachable = self._ping_fn(self.node2ip[name], self.port,
                              timeout=_EPO_PING_TIMEOUT,
                              live_port_needed=True)
    if not reachable:
      continue

    self.feedback_fn("Node %s became available" % name)
    self.up.add(name)
    self.down.difference_update(self.up)
    # Return right away instead of sleeping; this also ends the iteration
    # over the set that was just modified
    return

  remaining = began + secs - time.time()
  self._sleep_fn(max(0.0, remaining))
1447
1450 """Run action_cb when nodes become reachable.
1451
1452 @param node_list: The list of nodes to be reachable
1453 @param action_cb: Callback called when a new host is reachable
1454 @param interval: The earliest time to retry
1455
1456 """
1457 client = GetClient()
1458 cluster_info = client.QueryClusterInfo()
1459 if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
1460 family = netutils.IPAddress.family
1461 else:
1462 family = netutils.IP6Address.family
1463
1464 node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
1465 for node in node_list)
1466
1467 port = netutils.GetDaemonPort(constants.NODED)
1468 helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port,
1469 ToStdout)
1470
1471 try:
1472 return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
1473 wait_fn=helper.Wait)
1474 except utils.RetryTimeout:
1475 ToStderr("Time exceeded while waiting for nodes to become reachable"
1476 " again:\n - %s", " - ".join(helper.down))
1477 return False
1478
1482 """Start the instances conditional based on node_states.
1483
1484 @param opts: The command line options selected by the user
1485 @param inst_map: A dict of inst -> nodes mapping
1486 @param nodes_online: A list of nodes online
1487 @param _instance_start_fn: Callback to start instances (unittest use only)
1488 @return: Success of the operation on all instances
1489
1490 """
1491 start_inst_list = []
1492 for (inst, nodes) in inst_map.items():
1493 if not (nodes - nodes_online):
1494
1495 start_inst_list.append(inst)
1496
1497 for inst in start_inst_list:
1498 del inst_map[inst]
1499
1500 if start_inst_list:
1501 return _instance_start_fn(opts, start_inst_list, True)
1502
1503 return True
1504
1505
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  Powers on the nodes supporting OOB and then waits for all nodes to
  become reachable, starting instances through L{_MaybeInstanceStartup}
  while waiting.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # _OobPower expects True to power nodes on; passing False here would send
  # the power-off command while trying to recover from an EPO.
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Hand the callback a copy of inst_map, as _MaybeInstanceStartup deletes
  # the entries of the instances it starts
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1530
1531
def _EpoOff(opts, node_list, inst_map):
  """Shuts down the instances and then powers off the nodes.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances_stopped = _InstanceStart(opts, inst_map.keys(), False,
                                     no_remember=True)
  if not instances_stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without nodes to power off there is nothing left to do; otherwise the
  # result of the OOB power-off decides the exit status
  if not node_list or _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS

  return constants.EXIT_FAILURE
1552
"""EPO operations.

Collects the nodes to operate on and the instances living on them, asks
for confirmation (unless forced) and then hands over to the power-on or
power-off handler, depending on opts.on.

@param opts: the command line options selected by the user
@type args: list
@param args: the names of the nodes to operate on, or of node groups if
    opts.groups is set; must be empty if opts.show_all is set
@rtype: int
@return: the desired exit code

"""
# --all excludes both --groups and explicit arguments
if opts.groups and opts.show_all:
  _stderr_fn("Only one of --groups or --all are allowed")
  return constants.EXIT_FAILURE
elif args and opts.show_all:
  _stderr_fn("Arguments in combination with --all are not allowed")
  return constants.EXIT_FAILURE

if cl is None:
  cl = GetClient()

if opts.groups:
  # Flatten the per-group query results into a single sequence of nodes
  node_query_list = \
    itertools.chain(*cl.QueryGroups(args, ["node_list"], False))
else:
  node_query_list = args

result = cl.QueryNodes(node_query_list, ["name", "master", "pinst_list",
                                         "sinst_list", "powered", "offline"],
                       False)

all_nodes = map(compat.fst, result)
node_list = []
inst_map = {}
for (node, master, pinsts, sinsts, powered, offline) in result:
  if not offline:
    # Record, for every instance on this node, the non-master nodes it
    # lives on; the master itself is never added to these sets
    for inst in (pinsts + sinsts):
      if inst in inst_map:
        if not master:
          inst_map[inst].add(node)
      elif master:
        inst_map[inst] = set()
      else:
        inst_map[inst] = set([node])

  if master and opts.on:
    # The master is never put on the list of nodes to power on
    continue
  elif master and not opts.show_all:
    _stderr_fn("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
    return constants.EXIT_FAILURE
  elif powered is None:
    _stdout_fn("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
  elif powered == opts.on:
    _stdout_fn("Node %s is already in desired power state, skipping", node)
  elif not offline or (offline and powered):
    node_list.append(node)

if not (opts.force or _confirm_fn(all_nodes, "nodes", "epo")):
  return constants.EXIT_FAILURE

if opts.on:
  return _on_fn(opts, all_nodes, node_list, inst_map)
else:
  return _off_fn(opts, node_list, inst_map)
1624
1627 buf = StringIO()
1628 buf.write("gnt-cluster init")
1629 PrintIPolicyCommand(buf, info["ipolicy"], False)
1630 buf.write(" ")
1631 buf.write(info["name"])
1632 return buf.getvalue()
1633
1644
1647 """Run a command and report its output, iff it failed.
1648
1649 @param cmd: the command to execute
1650 @type cmd: list
1651 @rtype: bool
1652 @return: False, if the execution failed.
1653
1654 """
1655 result = utils.RunCmd(cmd)
1656 if result.failed:
1657 ToStderr("Command %s failed: %s; Output %s" %
1658 (cmd, result.fail_reason, result.output))
1659 return False
1660 return True
1661
1664 """Verify that a given command succeeds on all online nodes.
1665
1666 As this function is intended to run during upgrades, it
1667 is implemented in such a way that it still works, if all Ganeti
1668 daemons are down.
1669
1670 @param cmd: the command to execute
1671 @type cmd: list
1672 @rtype: list
1673 @return: the list of node names that are online where
1674 the command failed.
1675
1676 """
1677 command = utils.text.ShellQuoteArgs([str(val) for val in cmd])
1678
1679 nodes = ssconf.SimpleStore().GetOnlineNodeList()
1680 master_node = ssconf.SimpleStore().GetMasterNode()
1681 cluster_name = ssconf.SimpleStore().GetClusterName()
1682
1683
1684 if master_node in nodes:
1685 nodes.remove(master_node)
1686 nodes.append(master_node)
1687
1688 failed = []
1689
1690 srun = ssh.SshRunner(cluster_name=cluster_name)
1691 for name in nodes:
1692 result = srun.Run(name, constants.SSH_LOGIN_USER, command)
1693 if result.exit_code != 0:
1694 failed.append(name)
1695
1696 return failed
1697
1700 """Verify that the given version of ganeti is installed on all online nodes.
1701
1702 Do nothing, if this is the case, otherwise print an appropriate
1703 message to stderr.
1704
1705 @param versionstring: the version to check for
1706 @type versionstring: string
1707 @rtype: bool
1708 @return: True, if the version is installed on all online nodes
1709
1710 """
1711 badnodes = _VerifyCommand(["test", "-d",
1712 os.path.join(pathutils.PKGLIBDIR, versionstring)])
1713 if badnodes:
1714 ToStderr("Ganeti version %s not installed on nodes %s"
1715 % (versionstring, ", ".join(badnodes)))
1716 return False
1717
1718 return True
1719
1732
1735 """Set the active version of ganeti to the given versionstring
1736
1737 @type versionstring: string
1738 @rtype: list
1739 @return: the list of nodes where the version change failed
1740
1741 """
1742 failed = []
1743 if constants.HAS_GNU_LN:
1744 failed.extend(_VerifyCommand(
1745 ["ln", "-s", "-f", "-T",
1746 os.path.join(pathutils.PKGLIBDIR, versionstring),
1747 os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
1748 failed.extend(_VerifyCommand(
1749 ["ln", "-s", "-f", "-T",
1750 os.path.join(pathutils.SHAREDIR, versionstring),
1751 os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
1752 else:
1753 failed.extend(_VerifyCommand(
1754 ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
1755 failed.extend(_VerifyCommand(
1756 ["ln", "-s", "-f", os.path.join(pathutils.PKGLIBDIR, versionstring),
1757 os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
1758 failed.extend(_VerifyCommand(
1759 ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
1760 failed.extend(_VerifyCommand(
1761 ["ln", "-s", "-f", os.path.join(pathutils.SHAREDIR, versionstring),
1762 os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
1763 return list(set(failed))
1764
"""Calls every function of a list, last one first.

@type fns: list of functions.
@param fns: the functions to be executed.

"""
for task in reversed(fns):
  task()
1775
1778 """Determine the version the configuration file currently has.
1779
1780 @rtype: tuple or None
1781 @return: (major, minor, revision) if the version can be determined,
1782 None otherwise
1783
1784 """
1785 config_data = serializer.LoadJson(utils.ReadFile(pathutils.CLUSTER_CONF_FILE))
1786 try:
1787 config_version = config_data["version"]
1788 except KeyError:
1789 return None
1790 return utils.SplitVersion(config_version)
1791
1794 """Read the file documenting the intent to upgrade the cluster.
1795
1796 @rtype: (string, string) or (None, None)
1797 @return: (old version, version to upgrade to), if the file exists,
1798 and (None, None) otherwise.
1799
1800 """
1801 if not os.path.isfile(pathutils.INTENT_TO_UPGRADE):
1802 return (None, None)
1803
1804 contentstring = utils.ReadFile(pathutils.INTENT_TO_UPGRADE)
1805 contents = utils.UnescapeAndSplit(contentstring)
1806 if len(contents) != 3:
1807
1808 return (None, None)
1809 return (contents[0], contents[1])
1810
1822
1825 """
1826 Carry out all the tasks necessary for an upgrade that happen before
1827 the configuration file, or Ganeti version, changes.
1828
1829 @type versionstring: string
1830 @param versionstring: the version to upgrade to
1831 @rtype: (bool, list)
1832 @return: tuple of a bool indicating success and a list of rollback tasks
1833
1834 """
1835 rollback = []
1836
1837 if not _VerifyVersionInstalled(versionstring):
1838 return (False, rollback)
1839
1840 _WriteIntentToUpgrade(versionstring)
1841 rollback.append(
1842 lambda: utils.RunCmd(["rm", "-f", pathutils.INTENT_TO_UPGRADE]))
1843
1844 ToStdout("Draining queue")
1845 client = GetClient()
1846 client.SetQueueDrainFlag(True)
1847
1848 rollback.append(lambda: GetClient().SetQueueDrainFlag(False))
1849
1850 if utils.SimpleRetry(0, _GetRunning,
1851 constants.UPGRADE_QUEUE_POLL_INTERVAL,
1852 constants.UPGRADE_QUEUE_DRAIN_TIMEOUT):
1853 ToStderr("Failed to completely empty the queue.")
1854 return (False, rollback)
1855
1856 ToStdout("Pausing the watcher for one hour.")
1857 rollback.append(lambda: GetClient().SetWatcherPause(None))
1858 GetClient().SetWatcherPause(time.time() + 60 * 60)
1859
1860 ToStdout("Stopping daemons on master node.")
1861 if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
1862 return (False, rollback)
1863
1864 if not _VerifyVersionInstalled(versionstring):
1865 utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
1866 return (False, rollback)
1867
1868 ToStdout("Stopping daemons everywhere.")
1869 rollback.append(lambda: _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
1870 badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
1871 if badnodes:
1872 ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
1873 return (False, rollback)
1874
1875 backuptar = os.path.join(pathutils.BACKUP_DIR, "ganeti%d.tar" % time.time())
1876 ToStdout("Backing up configuration as %s" % backuptar)
1877 if not _RunCommandAndReport(["mkdir", "-p", pathutils.BACKUP_DIR]):
1878 return (False, rollback)
1879
1880
1881
1882 (_, tmp_name) = tempfile.mkstemp(prefix=backuptar, dir=pathutils.BACKUP_DIR)
1883 if not _RunCommandAndReport(["tar", "-cf", tmp_name,
1884 "--exclude=queue/archive",
1885 pathutils.DATA_DIR]):
1886 return (False, rollback)
1887
1888 os.rename(tmp_name, backuptar)
1889 return (True, rollback)
1890
"""
Switch to the new Ganeti version and change the configuration,
in correct order.

For a downgrade the configuration is converted before the version is
switched; for an upgrade the version is switched first and the
configuration is converted afterwards.

@type versionstring: string
@param versionstring: the version to change to
@type downgrade: bool
@param downgrade: True, if the configuration should be downgraded
@rtype: (bool, list)
@return: tuple of a bool indicating success, and a list of
    additional rollback tasks

"""
rollback = []
if downgrade:
  ToStdout("Downgrading configuration")
  if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
    return (False, rollback)

ToStdout("Switching to version %s on all nodes" % versionstring)
# The undo action (switching back to constants.DIR_VERSION) is registered
# before the switch is attempted, so it also covers a partial switch
rollback.append(lambda: _SetGanetiVersion(constants.DIR_VERSION))
badnodes = _SetGanetiVersion(versionstring)
if badnodes:
  ToStderr("Failed to switch to Ganeti version %s on nodes %s"
           % (versionstring, ", ".join(badnodes)))
  # For a downgrade the failure is only reported and the procedure goes on
  if not downgrade:
    return (False, rollback)

if not downgrade:
  ToStdout("Upgrading configuration")
  if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
    return (False, rollback)

return (True, rollback)
1935
"""
Carry out the upgrade actions necessary after switching to the new
Ganeti version and updating the configuration.

As this part is run at a time where the new version of Ganeti is already
running, no communication should happen via luxi, as this is not a stable
interface. Also, as the configuration change is the point of no return,
all actions are pushed through, even if some of them fail.

@param oldversion: the version the upgrade started from
@type oldversion: string
@rtype: int
@return: the intended return value

"""
# Set to 1 by any failing step; no failure aborts the remaining steps
returnvalue = 0

ToStdout("Ensuring directories everywhere.")
badnodes = _VerifyCommand([pathutils.ENSURE_DIRS])
if badnodes:
  ToStderr("Warning: failed to ensure directories on %s." %
           (", ".join(badnodes)))
  returnvalue = 1

ToStdout("Starting daemons everywhere.")
badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
if badnodes:
  ToStderr("Warning: failed to start daemons on %s." % (", ".join(badnodes),))
  returnvalue = 1

ToStdout("Redistributing the configuration.")
if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
  returnvalue = 1

ToStdout("Restarting daemons everywhere.")
badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
if badnodes:
  # A node may have failed both the stop and the start; list it only once
  ToStderr("Warning: failed to start daemons on %s." %
           (", ".join(list(set(badnodes))),))
  returnvalue = 1

ToStdout("Undraining the queue.")
if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
  returnvalue = 1

# The result of removing the intent file does not influence the exit code
_RunCommandAndReport(["rm", "-f", pathutils.INTENT_TO_UPGRADE])

ToStdout("Running post-upgrade hooks")
if not _RunCommandAndReport([pathutils.POST_UPGRADE, oldversion]):
  returnvalue = 1

ToStdout("Unpausing the watcher.")
if not _RunCommandAndReport(["gnt-cluster", "watcher", "continue"]):
  returnvalue = 1

ToStdout("Verifying cluster.")
if not _RunCommandAndReport(["gnt-cluster", "verify"]):
  returnvalue = 1

return returnvalue
1999
"""Upgrade a cluster to a new ganeti version.

Either starts a new upgrade to the version given by opts.to, or resumes
the upgrade recorded in the intent-to-upgrade file (opts.resume).

@param opts: the command line options selected by the user
@type args: list
@param args: should be an empty list
@rtype: int
@return: the desired exit code

"""
# Exactly one of --to and --resume is required
if ((not opts.resume and opts.to is None)
    or (opts.resume and opts.to is not None)):
  ToStderr("Precisely one of the options --to and --resume"
           " has to be given")
  return 1

# A new upgrade may only be started if no other upgrade is recorded as
# being in progress; a recorded upgrade to the very same target version is
# turned into a resume instead
if not opts.resume:
  oldversion, versionstring = _ReadIntentToUpgrade()
  if versionstring is not None:
    if versionstring == opts.to:
      ToStderr("An upgrade is already in progress. Target version matches,"
               " resuming.")
      opts.resume = True
      opts.to = None
    else:
      ToStderr("An upgrade from %s to %s is in progress; use --resume to"
               " finish it first" % (oldversion, versionstring))
      return 1

# Default for a fresh upgrade; overwritten below when resuming
oldversion = constants.RELEASE_VERSION

if opts.resume:
  ssconf.CheckMaster(False)
  oldversion, versionstring = _ReadIntentToUpgrade()
  if versionstring is None:
    # No upgrade recorded, hence nothing to resume
    return 0
  version = utils.version.ParseVersion(versionstring)
  if version is None:
    return 1
  configversion = _GetConfigVersion()
  if configversion is None:
    return 1
  # The configuration counts as already modified if its version is correct
  # for the target version, except when the target differs from
  # constants.DIR_VERSION while the configuration still carries the
  # constants.CONFIG_* version
  config_already_modified = \
    (utils.IsCorrectConfigVersion(version, configversion) and
     not (versionstring != constants.DIR_VERSION and
          configversion == (constants.CONFIG_MAJOR, constants.CONFIG_MINOR,
                            constants.CONFIG_REVISION)))
  if not config_already_modified:
    # The upgrade is redone from the beginning (see below), so all daemons
    # are started first
    _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
else:
  versionstring = opts.to
  config_already_modified = False
  version = utils.version.ParseVersion(versionstring)
  if version is None:
    ToStderr("Could not parse version string %s" % versionstring)
    return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

if not config_already_modified:
  success, rollback = _UpgradeBeforeConfigurationChange(versionstring)
  if not success:
    _ExecuteCommands(rollback)
    return 1
else:
  rollback = []

downgrade = utils.version.ShouldCfgdowngrade(version)

success, additionalrollback = \
    _SwitchVersionAndConfig(versionstring, downgrade)
if not success:
  # Undo the version switch first, then the preparation steps
  rollback.extend(additionalrollback)
  _ExecuteCommands(rollback)
  return 1

return _UpgradeAfterConfigurationChange(oldversion)
2093
2094
#: dictionary mapping each subcommand name to a tuple of (handler function,
#: argument definitions, list of options, usage string, description)
commands = {
  "init": (
    InitCluster, [ArgHost(min=1, max=1)],
    [BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
     HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT,
     NIC_PARAMS_OPT, NOLVM_STORAGE_OPT, NOMODIFY_ETCHOSTS_OPT,
     NOMODIFY_SSH_SETUP_OPT, SECONDARY_IP_OPT, VG_NAME_OPT,
     MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, DRBD_HELPER_OPT,
     DEFAULT_IALLOCATOR_OPT, PRIMARY_IP_VERSION_OPT, PREALLOC_WIPE_DISKS_OPT,
     NODE_PARAMS_OPT, GLOBAL_SHARED_FILEDIR_OPT, USE_EXTERNAL_MIP_SCRIPT,
     DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT, ENABLED_DISK_TEMPLATES_OPT,
     IPOLICY_STD_SPECS_OPT] + INSTANCE_POLICY_OPTS + SPLIT_ISPECS_OPTS,
    "[opts...] <cluster_name>", "Initialises a new cluster configuration"),
  "destroy": (
    DestroyCluster, ARGS_NONE, [YES_DOIT_OPT],
    "", "Destroy cluster"),
  "rename": (
    RenameCluster, [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster"),
  "redist-conf": (
    RedistributeConfig, ARGS_NONE, SUBMIT_OPTS +
    [DRY_RUN_OPT, PRIORITY_OPT, FORCE_DISTRIBUTION],
    "", "Forces a push of the configuration file and ssconf files"
    " to the nodes in the cluster"),
  "verify": (
    VerifyCluster, ARGS_NONE,
    [VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
     DRY_RUN_OPT, PRIORITY_OPT, NODEGROUP_OPT, IGNORE_ERRORS_OPT],
    "", "Does a check on the cluster configuration"),
  "verify-disks": (
    VerifyDisks, ARGS_NONE, [PRIORITY_OPT],
    "", "Does a check on the cluster disk status"),
  "repair-disk-sizes": (
    RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]", "Updates mismatches in recorded disk sizes"),
  "master-failover": (
    MasterFailover, ARGS_NONE, [NOVOTING_OPT, FORCE_FAILOVER],
    "", "Makes the current node the master"),
  "master-ping": (
    MasterPing, ARGS_NONE, [],
    "", "Checks if the master is alive"),
  "version": (
    ShowClusterVersion, ARGS_NONE, [],
    "", "Shows the cluster version"),
  "getmaster": (
    ShowClusterMaster, ARGS_NONE, [],
    "", "Shows the cluster master"),
  "copyfile": (
    ClusterCopyFile, [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>", "Copies a file to all (or only some) nodes"),
  "command": (
    RunClusterCommand, [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT, SHOW_MACHINE_OPT, FAILURE_ONLY_OPT],
    "[-n node...] <command>", "Runs a command on all (or only some) nodes"),
  "info": (
    ShowClusterConfig, ARGS_NONE, [ROMAN_OPT],
    "[--roman]", "Show cluster configuration"),
  "list-tags": (
    ListTags, ARGS_NONE, [], "", "List the tags of the cluster"),
  "add-tags": (
    AddTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Add tags to the cluster"),
  "remove-tags": (
    RemoveTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Remove tags from the cluster"),
  "search-tags": (
    SearchTags, [ArgUnknown(min=1, max=1)], [PRIORITY_OPT], "",
    "Searches the tags on all objects on"
    " the cluster for a given pattern (regex)"),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [], "drain|undrain|info", "Change queue properties"),
  "watcher": (
    WatcherOps,
    [ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
     ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"])],
    [],
    "{pause <timespec>|continue|info}", "Change watcher properties"),
  "modify": (
    SetClusterParams, ARGS_NONE,
    [FORCE_OPT,
     BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, HVLIST_OPT, MASTER_NETDEV_OPT,
     MASTER_NETMASK_OPT, NIC_PARAMS_OPT, NOLVM_STORAGE_OPT, VG_NAME_OPT,
     MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, ADD_UIDS_OPT, REMOVE_UIDS_OPT,
     DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT,
     RESERVED_LVS_OPT, DRY_RUN_OPT, PRIORITY_OPT, PREALLOC_WIPE_DISKS_OPT,
     NODE_PARAMS_OPT, USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT, HV_STATE_OPT,
     DISK_STATE_OPT] + SUBMIT_OPTS +
    [ENABLED_DISK_TEMPLATES_OPT, IPOLICY_STD_SPECS_OPT, MODIFY_ETCHOSTS_OPT] +
    INSTANCE_POLICY_OPTS + [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT],
    "[opts...]",
    "Alters the parameters of the cluster"),
  "renew-crypto": (
    RenewCrypto, ARGS_NONE,
    [NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
     NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
     NEW_SPICE_CERT_OPT, SPICE_CERT_OPT, SPICE_CACERT_OPT],
    "[opts...]",
    "Renews cluster certificates, keys and secrets"),
  "epo": (
    Epo, [ArgUnknown()],
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
     SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT],
    "[opts...] [args]",
    "Performs an emergency power-off on given args"),
  "activate-master-ip": (
    ActivateMasterIp, ARGS_NONE, [], "", "Activates the master IP"),
  "deactivate-master-ip": (
    DeactivateMasterIp, ARGS_NONE, [CONFIRM_OPT], "",
    "Deactivates the master IP"),
  "show-ispecs-cmd": (
    ShowCreateCommand, ARGS_NONE, [], "",
    "Show the command line to re-create the cluster"),
  "upgrade": (
    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
    "Upgrade (or downgrade) to a new Ganeti version"),
  }
2217
2218
2219
#: dictionary mapping alternative command names to the names of the
#: commands they stand for
aliases = {
  "masterfailover": "master-failover",
  "show": "info",
  }
2224
2225
2226 -def Main():
2229