1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Cluster related commands"""
22
23
24
25
26
27
28
29 import os.path
30 import time
31 import OpenSSL
32 import itertools
33
34 from ganeti.cli import *
35 from ganeti import opcodes
36 from ganeti import constants
37 from ganeti import errors
38 from ganeti import utils
39 from ganeti import bootstrap
40 from ganeti import ssh
41 from ganeti import objects
42 from ganeti import uidpool
43 from ganeti import compat
44 from ganeti import netutils
45
46
# Boolean command line flag (off by default), stored as "opts.on"; according
# to its help text it requests recovery from an EPO
ON_OPT = cli_option("--on", default=False,
                    action="store_true", dest="on",
                    help="Recover from an EPO")

# Boolean command line flag (off by default), stored as "opts.groups"; when
# set, positional arguments are treated as node groups instead of nodes
GROUPS_OPT = cli_option("--groups", default=False,
                        action="store_true", dest="groups",
                        help="Arguments are node groups instead of nodes")

# Retry interval handed to the "wait until nodes are reachable" loop
# (presumably seconds -- TODO confirm against utils.Retry)
_EPO_PING_INTERVAL = 30
# Timeout passed to the ping function for a single reachability probe
# (presumably seconds -- TODO confirm against netutils.TcpPing)
_EPO_PING_TIMEOUT = 1
# Overall timeout handed to utils.Retry while waiting for nodes to become
# reachable again (presumably seconds, i.e. 15 minutes -- TODO confirm)
_EPO_REACHABLE_TIMEOUT = 15 * 60
62 """Initialize the cluster.
63
64 @param opts: the command line options selected by the user
65 @type args: list
66 @param args: should contain only one element, the desired
67 cluster name
68 @rtype: int
69 @return: the desired exit code
70
71 """
72 if not opts.lvm_storage and opts.vg_name:
73 ToStderr("Options --no-lvm-storage and --vg-name conflict.")
74 return 1
75
76 vg_name = opts.vg_name
77 if opts.lvm_storage and not opts.vg_name:
78 vg_name = constants.DEFAULT_VG
79
80 if not opts.drbd_storage and opts.drbd_helper:
81 ToStderr("Options --no-drbd-storage and --drbd-usermode-helper conflict.")
82 return 1
83
84 drbd_helper = opts.drbd_helper
85 if opts.drbd_storage and not opts.drbd_helper:
86 drbd_helper = constants.DEFAULT_DRBD_HELPER
87
88 master_netdev = opts.master_netdev
89 if master_netdev is None:
90 master_netdev = constants.DEFAULT_BRIDGE
91
92 hvlist = opts.enabled_hypervisors
93 if hvlist is None:
94 hvlist = constants.DEFAULT_ENABLED_HYPERVISOR
95 hvlist = hvlist.split(",")
96
97 hvparams = dict(opts.hvparams)
98 beparams = opts.beparams
99 nicparams = opts.nicparams
100
101
102 beparams = objects.FillDict(constants.BEC_DEFAULTS, beparams)
103 utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
104
105
106 nicparams = objects.FillDict(constants.NICC_DEFAULTS, nicparams)
107 utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
108
109
110 if opts.ndparams is None:
111 ndparams = dict(constants.NDC_DEFAULTS)
112 else:
113 ndparams = objects.FillDict(constants.NDC_DEFAULTS, opts.ndparams)
114 utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
115
116
117 for hv in constants.HYPER_TYPES:
118 if hv not in hvparams:
119 hvparams[hv] = {}
120 hvparams[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], hvparams[hv])
121 utils.ForceDictType(hvparams[hv], constants.HVS_PARAMETER_TYPES)
122
123 if opts.candidate_pool_size is None:
124 opts.candidate_pool_size = constants.MASTER_POOL_SIZE_DEFAULT
125
126 if opts.mac_prefix is None:
127 opts.mac_prefix = constants.DEFAULT_MAC_PREFIX
128
129 uid_pool = opts.uid_pool
130 if uid_pool is not None:
131 uid_pool = uidpool.ParseUidPool(uid_pool)
132
133 if opts.prealloc_wipe_disks is None:
134 opts.prealloc_wipe_disks = False
135
136 try:
137 primary_ip_version = int(opts.primary_ip_version)
138 except (ValueError, TypeError), err:
139 ToStderr("Invalid primary ip version value: %s" % str(err))
140 return 1
141
142 bootstrap.InitCluster(cluster_name=args[0],
143 secondary_ip=opts.secondary_ip,
144 vg_name=vg_name,
145 mac_prefix=opts.mac_prefix,
146 master_netdev=master_netdev,
147 file_storage_dir=opts.file_storage_dir,
148 shared_file_storage_dir=opts.shared_file_storage_dir,
149 enabled_hypervisors=hvlist,
150 hvparams=hvparams,
151 beparams=beparams,
152 nicparams=nicparams,
153 ndparams=ndparams,
154 candidate_pool_size=opts.candidate_pool_size,
155 modify_etc_hosts=opts.modify_etc_hosts,
156 modify_ssh_setup=opts.modify_ssh_setup,
157 maintain_node_health=opts.maintain_node_health,
158 drbd_helper=drbd_helper,
159 uid_pool=uid_pool,
160 default_iallocator=opts.default_iallocator,
161 primary_ip_version=primary_ip_version,
162 prealloc_wipe_disks=opts.prealloc_wipe_disks,
163 )
164 op = opcodes.OpClusterPostInit()
165 SubmitOpCode(op, opts=opts)
166 return 0
167
171 """Destroy the cluster.
172
173 @param opts: the command line options selected by the user
174 @type args: list
175 @param args: should be an empty list
176 @rtype: int
177 @return: the desired exit code
178
179 """
180 if not opts.yes_do_it:
181 ToStderr("Destroying a cluster is irreversible. If you really want"
182 " destroy this cluster, supply the --yes-do-it option.")
183 return 1
184
185 op = opcodes.OpClusterDestroy()
186 master = SubmitOpCode(op, opts=opts)
187
188
189 bootstrap.FinalizeClusterDestroy(master)
190 return 0
191
194 """Rename the cluster.
195
196 @param opts: the command line options selected by the user
197 @type args: list
198 @param args: should contain only one element, the new cluster name
199 @rtype: int
200 @return: the desired exit code
201
202 """
203 cl = GetClient()
204
205 (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
206
207 new_name = args[0]
208 if not opts.force:
209 usertext = ("This will rename the cluster from '%s' to '%s'. If you are"
210 " connected over the network to the cluster name, the"
211 " operation is very dangerous as the IP address will be"
212 " removed from the node and the change may not go through."
213 " Continue?") % (cluster_name, new_name)
214 if not AskUser(usertext):
215 return 1
216
217 op = opcodes.OpClusterRename(name=new_name)
218 result = SubmitOpCode(op, opts=opts, cl=cl)
219
220 if result:
221 ToStdout("Cluster renamed from '%s' to '%s'", cluster_name, result)
222
223 return 0
224
227 """Forces push of the cluster configuration.
228
229 @param opts: the command line options selected by the user
230 @type args: list
231 @param args: empty list
232 @rtype: int
233 @return: the desired exit code
234
235 """
236 op = opcodes.OpClusterRedistConf()
237 SubmitOrSend(op, opts)
238 return 0
239
242 """Write version of ganeti software to the standard output.
243
244 @param opts: the command line options selected by the user
245 @type args: list
246 @param args: should be an empty list
247 @rtype: int
248 @return: the desired exit code
249
250 """
251 cl = GetClient()
252 result = cl.QueryClusterInfo()
253 ToStdout("Software version: %s", result["software_version"])
254 ToStdout("Internode protocol: %s", result["protocol_version"])
255 ToStdout("Configuration format: %s", result["config_version"])
256 ToStdout("OS api version: %s", result["os_api_version"])
257 ToStdout("Export interface: %s", result["export_version"])
258 return 0
259
262 """Write name of master node to the standard output.
263
264 @param opts: the command line options selected by the user
265 @type args: list
266 @param args: should be an empty list
267 @rtype: int
268 @return: the desired exit code
269
270 """
271 master = bootstrap.GetMaster()
272 ToStdout(master)
273 return 0
274
277 """Print Grouped parameters (be, nic, disk) by group.
278
279 @type paramsdict: dict of dicts
280 @param paramsdict: {group: {param: value, ...}, ...}
281 @type level: int
282 @param level: Level of indention
283
284 """
285 indent = " " * level
286 for item, val in sorted(paramsdict.items()):
287 if isinstance(val, dict):
288 ToStdout("%s- %s:", indent, item)
289 _PrintGroupedParams(val, level=level + 1, roman=roman)
290 elif roman and isinstance(val, int):
291 ToStdout("%s %s: %s", indent, item, compat.TryToRoman(val))
292 else:
293 ToStdout("%s %s: %s", indent, item, val)
294
297 """Shows cluster information.
298
299 @param opts: the command line options selected by the user
300 @type args: list
301 @param args: should be an empty list
302 @rtype: int
303 @return: the desired exit code
304
305 """
306 cl = GetClient()
307 result = cl.QueryClusterInfo()
308
309 ToStdout("Cluster name: %s", result["name"])
310 ToStdout("Cluster UUID: %s", result["uuid"])
311
312 ToStdout("Creation time: %s", utils.FormatTime(result["ctime"]))
313 ToStdout("Modification time: %s", utils.FormatTime(result["mtime"]))
314
315 ToStdout("Master node: %s", result["master"])
316
317 ToStdout("Architecture (this node): %s (%s)",
318 result["architecture"][0], result["architecture"][1])
319
320 if result["tags"]:
321 tags = utils.CommaJoin(utils.NiceSort(result["tags"]))
322 else:
323 tags = "(none)"
324
325 ToStdout("Tags: %s", tags)
326
327 ToStdout("Default hypervisor: %s", result["default_hypervisor"])
328 ToStdout("Enabled hypervisors: %s",
329 utils.CommaJoin(result["enabled_hypervisors"]))
330
331 ToStdout("Hypervisor parameters:")
332 _PrintGroupedParams(result["hvparams"])
333
334 ToStdout("OS-specific hypervisor parameters:")
335 _PrintGroupedParams(result["os_hvp"])
336
337 ToStdout("OS parameters:")
338 _PrintGroupedParams(result["osparams"])
339
340 ToStdout("Hidden OSes: %s", utils.CommaJoin(result["hidden_os"]))
341 ToStdout("Blacklisted OSes: %s", utils.CommaJoin(result["blacklisted_os"]))
342
343 ToStdout("Cluster parameters:")
344 ToStdout(" - candidate pool size: %s",
345 compat.TryToRoman(result["candidate_pool_size"],
346 convert=opts.roman_integers))
347 ToStdout(" - master netdev: %s", result["master_netdev"])
348 ToStdout(" - lvm volume group: %s", result["volume_group_name"])
349 if result["reserved_lvs"]:
350 reserved_lvs = utils.CommaJoin(result["reserved_lvs"])
351 else:
352 reserved_lvs = "(none)"
353 ToStdout(" - lvm reserved volumes: %s", reserved_lvs)
354 ToStdout(" - drbd usermode helper: %s", result["drbd_usermode_helper"])
355 ToStdout(" - file storage path: %s", result["file_storage_dir"])
356 ToStdout(" - shared file storage path: %s",
357 result["shared_file_storage_dir"])
358 ToStdout(" - maintenance of node health: %s",
359 result["maintain_node_health"])
360 ToStdout(" - uid pool: %s",
361 uidpool.FormatUidPool(result["uid_pool"],
362 roman=opts.roman_integers))
363 ToStdout(" - default instance allocator: %s", result["default_iallocator"])
364 ToStdout(" - primary ip version: %d", result["primary_ip_version"])
365 ToStdout(" - preallocation wipe disks: %s", result["prealloc_wipe_disks"])
366 ToStdout(" - OS search path: %s", utils.CommaJoin(constants.OS_SEARCH_PATH))
367
368 ToStdout("Default node parameters:")
369 _PrintGroupedParams(result["ndparams"], roman=opts.roman_integers)
370
371 ToStdout("Default instance parameters:")
372 _PrintGroupedParams(result["beparams"], roman=opts.roman_integers)
373
374 ToStdout("Default nic parameters:")
375 _PrintGroupedParams(result["nicparams"], roman=opts.roman_integers)
376
377 return 0
378
381 """Copy a file from master to some nodes.
382
383 @param opts: the command line options selected by the user
384 @type args: list
385 @param args: should contain only one element, the path of
386 the file to be copied
387 @rtype: int
388 @return: the desired exit code
389
390 """
391 filename = args[0]
392 if not os.path.exists(filename):
393 raise errors.OpPrereqError("No such filename '%s'" % filename,
394 errors.ECODE_INVAL)
395
396 cl = GetClient()
397
398 cluster_name = cl.QueryConfigValues(["cluster_name"])[0]
399
400 results = GetOnlineNodes(nodes=opts.nodes, cl=cl, filter_master=True,
401 secondary_ips=opts.use_replication_network,
402 nodegroup=opts.nodegroup)
403
404 srun = ssh.SshRunner(cluster_name=cluster_name)
405 for node in results:
406 if not srun.CopyFileToNode(node, filename):
407 ToStderr("Copy of file %s to node %s failed", filename, node)
408
409 return 0
410
413 """Run a command on some nodes.
414
415 @param opts: the command line options selected by the user
416 @type args: list
417 @param args: should contain the command to be run and its arguments
418 @rtype: int
419 @return: the desired exit code
420
421 """
422 cl = GetClient()
423
424 command = " ".join(args)
425
426 nodes = GetOnlineNodes(nodes=opts.nodes, cl=cl, nodegroup=opts.nodegroup)
427
428 cluster_name, master_node = cl.QueryConfigValues(["cluster_name",
429 "master_node"])
430
431 srun = ssh.SshRunner(cluster_name=cluster_name)
432
433
434 if master_node in nodes:
435 nodes.remove(master_node)
436 nodes.append(master_node)
437
438 for name in nodes:
439 result = srun.Run(name, "root", command)
440 ToStdout("------------------------------------------------")
441 ToStdout("node: %s", name)
442 ToStdout("%s", result.output)
443 ToStdout("return code = %s", result.exit_code)
444
445 return 0
446
449 """Verify integrity of cluster, performing various test on nodes.
450
451 @param opts: the command line options selected by the user
452 @type args: list
453 @param args: should be an empty list
454 @rtype: int
455 @return: the desired exit code
456
457 """
458 skip_checks = []
459
460 if opts.skip_nplusone_mem:
461 skip_checks.append(constants.VERIFY_NPLUSONE_MEM)
462
463 cl = GetClient()
464
465 op = opcodes.OpClusterVerify(verbose=opts.verbose,
466 error_codes=opts.error_codes,
467 debug_simulate_errors=opts.simulate_errors,
468 skip_checks=skip_checks,
469 group_name=opts.nodegroup)
470 result = SubmitOpCode(op, cl=cl, opts=opts)
471
472
473 jex = JobExecutor(cl=cl, opts=opts)
474
475 for (status, job_id) in result[constants.JOB_IDS_KEY]:
476 jex.AddJobId(None, status, job_id)
477
478 results = jex.GetResults()
479
480 (bad_jobs, bad_results) = \
481 map(len,
482
483 map(list,
484
485 map(compat.partial(itertools.ifilterfalse, bool),
486
487 zip(*((job_success, len(op_results) == 1 and op_results[0])
488 for (job_success, op_results) in results)))))
489
490 if bad_jobs == 0 and bad_results == 0:
491 rcode = constants.EXIT_SUCCESS
492 else:
493 rcode = constants.EXIT_FAILURE
494 if bad_jobs > 0:
495 ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs)
496
497 return rcode
498
501 """Verify integrity of cluster disks.
502
503 @param opts: the command line options selected by the user
504 @type args: list
505 @param args: should be an empty list
506 @rtype: int
507 @return: the desired exit code
508
509 """
510 cl = GetClient()
511
512 op = opcodes.OpClusterVerifyDisks()
513
514 result = SubmitOpCode(op, cl=cl, opts=opts)
515
516
517 jex = JobExecutor(cl=cl, opts=opts)
518
519 for (status, job_id) in result[constants.JOB_IDS_KEY]:
520 jex.AddJobId(None, status, job_id)
521
522 retcode = constants.EXIT_SUCCESS
523
524 for (status, result) in jex.GetResults():
525 if not status:
526 ToStdout("Job failed: %s", result)
527 continue
528
529 ((bad_nodes, instances, missing), ) = result
530
531 for node, text in bad_nodes.items():
532 ToStdout("Error gathering data on node %s: %s",
533 node, utils.SafeEncode(text[-400:]))
534 retcode = constants.EXIT_FAILURE
535 ToStdout("You need to fix these nodes first before fixing instances")
536
537 for iname in instances:
538 if iname in missing:
539 continue
540 op = opcodes.OpInstanceActivateDisks(instance_name=iname)
541 try:
542 ToStdout("Activating disks for instance '%s'", iname)
543 SubmitOpCode(op, opts=opts, cl=cl)
544 except errors.GenericError, err:
545 nret, msg = FormatError(err)
546 retcode |= nret
547 ToStderr("Error activating disks for instance %s: %s", iname, msg)
548
549 if missing:
550 for iname, ival in missing.iteritems():
551 all_missing = compat.all(x[0] in bad_nodes for x in ival)
552 if all_missing:
553 ToStdout("Instance %s cannot be verified as it lives on"
554 " broken nodes", iname)
555 else:
556 ToStdout("Instance %s has missing logical volumes:", iname)
557 ival.sort()
558 for node, vol in ival:
559 if node in bad_nodes:
560 ToStdout("\tbroken node %s /dev/%s", node, vol)
561 else:
562 ToStdout("\t%s /dev/%s", node, vol)
563
564 ToStdout("You need to replace or recreate disks for all the above"
565 " instances if this message persists after fixing broken nodes.")
566 retcode = constants.EXIT_FAILURE
567
568 return retcode
569
572 """Verify sizes of cluster disks.
573
574 @param opts: the command line options selected by the user
575 @type args: list
576 @param args: optional list of instances to restrict check to
577 @rtype: int
578 @return: the desired exit code
579
580 """
581 op = opcodes.OpClusterRepairDiskSizes(instances=args)
582 SubmitOpCode(op, opts=opts)
583
587 """Failover the master node.
588
589 This command, when run on a non-master node, will cause the current
590 master to cease being master, and the non-master to become new
591 master.
592
593 @param opts: the command line options selected by the user
594 @type args: list
595 @param args: should be an empty list
596 @rtype: int
597 @return: the desired exit code
598
599 """
600 if opts.no_voting:
601 usertext = ("This will perform the failover even if most other nodes"
602 " are down, or if this node is outdated. This is dangerous"
603 " as it can lead to a non-consistent cluster. Check the"
604 " gnt-cluster(8) man page before proceeding. Continue?")
605 if not AskUser(usertext):
606 return 1
607
608 return bootstrap.MasterFailover(no_voting=opts.no_voting)
609
612 """Checks if the master is alive.
613
614 @param opts: the command line options selected by the user
615 @type args: list
616 @param args: should be an empty list
617 @rtype: int
618 @return: the desired exit code
619
620 """
621 try:
622 cl = GetClient()
623 cl.QueryClusterInfo()
624 return 0
625 except Exception:
626 return 1
627
647
648
649 -def _RenewCrypto(new_cluster_cert, new_rapi_cert, rapi_cert_filename,
650 new_confd_hmac_key, new_cds, cds_filename,
651 force):
652 """Renews cluster certificates, keys and secrets.
653
654 @type new_cluster_cert: bool
655 @param new_cluster_cert: Whether to generate a new cluster certificate
656 @type new_rapi_cert: bool
657 @param new_rapi_cert: Whether to generate a new RAPI certificate
658 @type rapi_cert_filename: string
659 @param rapi_cert_filename: Path to file containing new RAPI certificate
660 @type new_confd_hmac_key: bool
661 @param new_confd_hmac_key: Whether to generate a new HMAC key
662 @type new_cds: bool
663 @param new_cds: Whether to generate a new cluster domain secret
664 @type cds_filename: string
665 @param cds_filename: Path to file containing new cluster domain secret
666 @type force: bool
667 @param force: Whether to ask user for confirmation
668
669 """
670 if new_rapi_cert and rapi_cert_filename:
671 ToStderr("Only one of the --new-rapi-certficate and --rapi-certificate"
672 " options can be specified at the same time.")
673 return 1
674
675 if new_cds and cds_filename:
676 ToStderr("Only one of the --new-cluster-domain-secret and"
677 " --cluster-domain-secret options can be specified at"
678 " the same time.")
679 return 1
680
681 if rapi_cert_filename:
682
683 try:
684 rapi_cert_pem = utils.ReadFile(rapi_cert_filename)
685
686 OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
687 rapi_cert_pem)
688 except Exception, err:
689 ToStderr("Can't load new RAPI certificate from %s: %s" %
690 (rapi_cert_filename, str(err)))
691 return 1
692
693 try:
694 OpenSSL.crypto.load_privatekey(OpenSSL.crypto.FILETYPE_PEM, rapi_cert_pem)
695 except Exception, err:
696 ToStderr("Can't load new RAPI private key from %s: %s" %
697 (rapi_cert_filename, str(err)))
698 return 1
699
700 else:
701 rapi_cert_pem = None
702
703 if cds_filename:
704 try:
705 cds = utils.ReadFile(cds_filename)
706 except Exception, err:
707 ToStderr("Can't load new cluster domain secret from %s: %s" %
708 (cds_filename, str(err)))
709 return 1
710 else:
711 cds = None
712
713 if not force:
714 usertext = ("This requires all daemons on all nodes to be restarted and"
715 " may take some time. Continue?")
716 if not AskUser(usertext):
717 return 1
718
719 def _RenewCryptoInner(ctx):
720 ctx.feedback_fn("Updating certificates and keys")
721 bootstrap.GenerateClusterCrypto(new_cluster_cert, new_rapi_cert,
722 new_confd_hmac_key,
723 new_cds,
724 rapi_cert_pem=rapi_cert_pem,
725 cds=cds)
726
727 files_to_copy = []
728
729 if new_cluster_cert:
730 files_to_copy.append(constants.NODED_CERT_FILE)
731
732 if new_rapi_cert or rapi_cert_pem:
733 files_to_copy.append(constants.RAPI_CERT_FILE)
734
735 if new_confd_hmac_key:
736 files_to_copy.append(constants.CONFD_HMAC_KEY)
737
738 if new_cds or cds:
739 files_to_copy.append(constants.CLUSTER_DOMAIN_SECRET_FILE)
740
741 if files_to_copy:
742 for node_name in ctx.nonmaster_nodes:
743 ctx.feedback_fn("Copying %s to %s" %
744 (", ".join(files_to_copy), node_name))
745 for file_name in files_to_copy:
746 ctx.ssh.CopyFileToNode(node_name, file_name)
747
748 RunWhileClusterStopped(ToStdout, _RenewCryptoInner)
749
750 ToStdout("All requested certificates and keys have been replaced."
751 " Running \"gnt-cluster verify\" now is recommended.")
752
753 return 0
754
757 """Renews cluster certificates, keys and secrets.
758
759 """
760 return _RenewCrypto(opts.new_cluster_cert,
761 opts.new_rapi_cert,
762 opts.rapi_cert,
763 opts.new_confd_hmac_key,
764 opts.new_cluster_domain_secret,
765 opts.cluster_domain_secret,
766 opts.force)
767
770 """Modify the cluster.
771
772 @param opts: the command line options selected by the user
773 @type args: list
774 @param args: should be an empty list
775 @rtype: int
776 @return: the desired exit code
777
778 """
779 if not (not opts.lvm_storage or opts.vg_name or
780 not opts.drbd_storage or opts.drbd_helper or
781 opts.enabled_hypervisors or opts.hvparams or
782 opts.beparams or opts.nicparams or opts.ndparams or
783 opts.candidate_pool_size is not None or
784 opts.uid_pool is not None or
785 opts.maintain_node_health is not None or
786 opts.add_uids is not None or
787 opts.remove_uids is not None or
788 opts.default_iallocator is not None or
789 opts.reserved_lvs is not None or
790 opts.master_netdev is not None or
791 opts.prealloc_wipe_disks is not None):
792 ToStderr("Please give at least one of the parameters.")
793 return 1
794
795 vg_name = opts.vg_name
796 if not opts.lvm_storage and opts.vg_name:
797 ToStderr("Options --no-lvm-storage and --vg-name conflict.")
798 return 1
799
800 if not opts.lvm_storage:
801 vg_name = ""
802
803 drbd_helper = opts.drbd_helper
804 if not opts.drbd_storage and opts.drbd_helper:
805 ToStderr("Options --no-drbd-storage and --drbd-usermode-helper conflict.")
806 return 1
807
808 if not opts.drbd_storage:
809 drbd_helper = ""
810
811 hvlist = opts.enabled_hypervisors
812 if hvlist is not None:
813 hvlist = hvlist.split(",")
814
815
816 hvparams = dict(opts.hvparams)
817 for hv_params in hvparams.values():
818 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
819
820 beparams = opts.beparams
821 utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
822
823 nicparams = opts.nicparams
824 utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
825
826 ndparams = opts.ndparams
827 if ndparams is not None:
828 utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
829
830 mnh = opts.maintain_node_health
831
832 uid_pool = opts.uid_pool
833 if uid_pool is not None:
834 uid_pool = uidpool.ParseUidPool(uid_pool)
835
836 add_uids = opts.add_uids
837 if add_uids is not None:
838 add_uids = uidpool.ParseUidPool(add_uids)
839
840 remove_uids = opts.remove_uids
841 if remove_uids is not None:
842 remove_uids = uidpool.ParseUidPool(remove_uids)
843
844 if opts.reserved_lvs is not None:
845 if opts.reserved_lvs == "":
846 opts.reserved_lvs = []
847 else:
848 opts.reserved_lvs = utils.UnescapeAndSplit(opts.reserved_lvs, sep=",")
849
850 op = opcodes.OpClusterSetParams(vg_name=vg_name,
851 drbd_helper=drbd_helper,
852 enabled_hypervisors=hvlist,
853 hvparams=hvparams,
854 os_hvp=None,
855 beparams=beparams,
856 nicparams=nicparams,
857 ndparams=ndparams,
858 candidate_pool_size=opts.candidate_pool_size,
859 maintain_node_health=mnh,
860 uid_pool=uid_pool,
861 add_uids=add_uids,
862 remove_uids=remove_uids,
863 default_iallocator=opts.default_iallocator,
864 prealloc_wipe_disks=opts.prealloc_wipe_disks,
865 master_netdev=opts.master_netdev,
866 reserved_lvs=opts.reserved_lvs)
867 SubmitOpCode(op, opts=opts)
868 return 0
869
872 """Queue operations.
873
874 @param opts: the command line options selected by the user
875 @type args: list
876 @param args: should contain only one element, the subcommand
877 @rtype: int
878 @return: the desired exit code
879
880 """
881 command = args[0]
882 client = GetClient()
883 if command in ("drain", "undrain"):
884 drain_flag = command == "drain"
885 client.SetQueueDrainFlag(drain_flag)
886 elif command == "info":
887 result = client.QueryConfigValues(["drain_flag"])
888 if result[0]:
889 val = "set"
890 else:
891 val = "unset"
892 ToStdout("The drain flag is %s" % val)
893 else:
894 raise errors.OpPrereqError("Command '%s' is not valid." % command,
895 errors.ECODE_INVAL)
896
897 return 0
898
901 if until is None or until < time.time():
902 ToStdout("The watcher is not paused.")
903 else:
904 ToStdout("The watcher is paused until %s.", time.ctime(until))
905
940
943 """Puts the node in the list to desired power state.
944
945 @param opts: The command line options selected by the user
946 @param node_list: The list of nodes to operate on
947 @param power: True if they should be powered on, False otherwise
948 @return: The success of the operation (none failed)
949
950 """
951 if power:
952 command = constants.OOB_POWER_ON
953 else:
954 command = constants.OOB_POWER_OFF
955
956 op = opcodes.OpOobCommand(node_names=node_list,
957 command=command,
958 ignore_status=True,
959 timeout=opts.oob_timeout,
960 power_delay=opts.power_delay)
961 result = SubmitOpCode(op, opts=opts)
962 errs = 0
963 for node_result in result:
964 (node_tuple, data_tuple) = node_result
965 (_, node_name) = node_tuple
966 (data_status, _) = data_tuple
967 if data_status != constants.RS_NORMAL:
968 assert data_status != constants.RS_UNAVAIL
969 errs += 1
970 ToStderr("There was a problem changing power for %s, please investigate",
971 node_name)
972
973 if errs > 0:
974 return False
975
976 return True
977
980 """Puts the instances in the list to desired state.
981
982 @param opts: The command line options selected by the user
983 @param inst_list: The list of instances to operate on
984 @param start: True if they should be started, False for shutdown
985 @return: The success of the operation (none failed)
986
987 """
988 if start:
989 opcls = opcodes.OpInstanceStartup
990 text_submit, text_success, text_failed = ("startup", "started", "starting")
991 else:
992 opcls = compat.partial(opcodes.OpInstanceShutdown,
993 timeout=opts.shutdown_timeout)
994 text_submit, text_success, text_failed = ("shutdown", "stopped", "stopping")
995
996 jex = JobExecutor(opts=opts)
997
998 for inst in inst_list:
999 ToStdout("Submit %s of instance %s", text_submit, inst)
1000 op = opcls(instance_name=inst)
1001 jex.QueueJob(inst, op)
1002
1003 results = jex.GetResults()
1004 bad_cnt = len([1 for (success, _) in results if not success])
1005
1006 if bad_cnt == 0:
1007 ToStdout("All instances have been %s successfully", text_success)
1008 else:
1009 ToStderr("There were errors while %s instances:\n"
1010 "%d error(s) out of %d instance(s)", text_failed, bad_cnt,
1011 len(results))
1012 return False
1013
1014 return True
1015
1018 """Helper class to make shared internal state sharing easier.
1019
1020 @ivar success: Indicates if all action_cb calls were successful
1021
1022 """
1023 - def __init__(self, node_list, action_cb, node2ip, port, feedback_fn,
1024 _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
1025 """Init the object.
1026
1027 @param node_list: The list of nodes to be reachable
1028 @param action_cb: Callback called when a new host is reachable
1029 @type node2ip: dict
1030 @param node2ip: Node to ip mapping
1031 @param port: The port to use for the TCP ping
1032 @param feedback_fn: The function used for feedback
1033 @param _ping_fn: Function to check reachabilty (for unittest use only)
1034 @param _sleep_fn: Function to sleep (for unittest use only)
1035
1036 """
1037 self.down = set(node_list)
1038 self.up = set()
1039 self.node2ip = node2ip
1040 self.success = True
1041 self.action_cb = action_cb
1042 self.port = port
1043 self.feedback_fn = feedback_fn
1044 self._ping_fn = _ping_fn
1045 self._sleep_fn = _sleep_fn
1046
1048 """When called we run action_cb.
1049
1050 @raises utils.RetryAgain: When there are still down nodes
1051
1052 """
1053 if not self.action_cb(self.up):
1054 self.success = False
1055
1056 if self.down:
1057 raise utils.RetryAgain()
1058 else:
1059 return self.success
1060
1061 - def Wait(self, secs):
1062 """Checks if a host is up or waits remaining seconds.
1063
1064 @param secs: The secs remaining
1065
1066 """
1067 start = time.time()
1068 for node in self.down:
1069 if self._ping_fn(self.node2ip[node], self.port, timeout=_EPO_PING_TIMEOUT,
1070 live_port_needed=True):
1071 self.feedback_fn("Node %s became available" % node)
1072 self.up.add(node)
1073 self.down -= self.up
1074
1075
1076 return
1077
1078 self._sleep_fn(max(0.0, start + secs - time.time()))
1079
1082 """Run action_cb when nodes become reachable.
1083
1084 @param node_list: The list of nodes to be reachable
1085 @param action_cb: Callback called when a new host is reachable
1086 @param interval: The earliest time to retry
1087
1088 """
1089 client = GetClient()
1090 cluster_info = client.QueryClusterInfo()
1091 if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
1092 family = netutils.IPAddress.family
1093 else:
1094 family = netutils.IP6Address.family
1095
1096 node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
1097 for node in node_list)
1098
1099 port = netutils.GetDaemonPort(constants.NODED)
1100 helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port,
1101 ToStdout)
1102
1103 try:
1104 return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
1105 wait_fn=helper.Wait)
1106 except utils.RetryTimeout:
1107 ToStderr("Time exceeded while waiting for nodes to become reachable"
1108 " again:\n - %s", " - ".join(helper.down))
1109 return False
1110
1114 """Start the instances conditional based on node_states.
1115
1116 @param opts: The command line options selected by the user
1117 @param inst_map: A dict of inst -> nodes mapping
1118 @param nodes_online: A list of nodes online
1119 @param _instance_start_fn: Callback to start instances (unittest use only)
1120 @return: Success of the operation on all instances
1121
1122 """
1123 start_inst_list = []
1124 for (inst, nodes) in inst_map.items():
1125 if not (nodes - nodes_online):
1126
1127 start_inst_list.append(inst)
1128
1129 for inst in start_inst_list:
1130 del inst_map[inst]
1131
1132 if start_inst_list:
1133 return _instance_start_fn(opts, start_inst_list, True)
1134
1135 return True
1136
1137
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  Powers on the nodes supporting out-of-band handling, then waits for all
  nodes to become reachable again, starting instances through the
  L{_MaybeInstanceStartup} callback while waiting.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The third argument of _OobPower is the desired power state; it must be
  # True here, passing False would request a power off of the nodes
  if node_list and not _OobPower(opts, node_list, True):
    # Not fatal: the reachability wait below decides about success
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # The callback gets its own copy of the mapping, leaving the caller's
  # dictionary untouched
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1162
1163
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances in the map are shut down first; only if that worked the
  given nodes (if any) are powered off.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances = inst_map.keys()
  stopped = _InstanceStart(opts, instances, False)
  if not stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without nodes to power off there is nothing left to do; otherwise the
  # result of the power off decides the exit status
  if node_list and not _OobPower(opts, node_list, False):
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1184
1185
def Epo(opts, args):
  """EPO operations.

  Validates the option combination, queries the selected nodes, works out
  which of them need a power state change and which instances live on them,
  asks for confirmation (unless forced) and finally dispatches to
  L{_EpoOn} or L{_EpoOff}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the names to operate on; they are used as node group names
      if C{opts.groups} is set and as node names otherwise, and must be
      empty if C{opts.show_all} is set
  @rtype: int
  @return: the desired exit code

  """
  if opts.groups and opts.show_all:
    ToStderr("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    ToStderr("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  client = GetClient()

  if opts.groups:
    group_rows = client.QueryGroups(names=args, fields=["node_list"],
                                    use_locking=False)
    # Materialise the chain so that a real list is passed on
    # NOTE(review): chain() only strips the per-group row wrapper; confirm
    # whether the "node_list" values need a further flattening step
    node_query_list = list(itertools.chain(*group_rows))
  else:
    node_query_list = args

  result = client.QueryNodes(names=node_query_list,
                             fields=["name", "master", "pinst_list",
                                     "sinst_list", "powered", "offline"],
                             use_locking=False)

  # The names are collected from the query result instead of being written
  # back into node_query_list: that list is empty when all nodes were
  # selected, and index assignment is not possible on every input type
  all_nodes = []
  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    all_nodes.append(node)

    if not offline:
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          # The master is never recorded as a node of the instance
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # The master is not considered for powering on
      continue
    elif master and not opts.show_all:
      ToStderr("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      ToStdout("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
    elif powered == opts.on:
      ToStdout("Node %s is already in desired power state, skipping", node)
    elif not offline or powered:
      node_list.append(node)

  if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
    return constants.EXIT_FAILURE

  if opts.on:
    return _EpoOn(opts, all_nodes, node_list, inst_map)
  else:
    return _EpoOff(opts, node_list, inst_map)
1257
1258
# Table of sub-commands. Every value is a five-element tuple: the function
# implementing the command, the definition of its positional arguments, the
# list of options it accepts, a usage string and a short description.
commands = {
  "init": (
    InitCluster,
    [ArgHost(min=1, max=1)],
    [
      BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
      HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, NIC_PARAMS_OPT,
      NOLVM_STORAGE_OPT, NOMODIFY_ETCHOSTS_OPT, NOMODIFY_SSH_SETUP_OPT,
      SECONDARY_IP_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT,
      UIDPOOL_OPT, DRBD_HELPER_OPT, NODRBD_STORAGE_OPT,
      DEFAULT_IALLOCATOR_OPT, PRIMARY_IP_VERSION_OPT,
      PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, GLOBAL_SHARED_FILEDIR_OPT,
    ],
    "[opts...] <cluster_name>",
    "Initialises a new cluster configuration",
  ),
  "destroy": (
    DestroyCluster,
    ARGS_NONE,
    [YES_DOIT_OPT],
    "",
    "Destroy cluster",
  ),
  "rename": (
    RenameCluster,
    [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster",
  ),
  "redist-conf": (
    RedistributeConfig,
    ARGS_NONE,
    [SUBMIT_OPT, DRY_RUN_OPT, PRIORITY_OPT],
    "",
    "Forces a push of the configuration file and ssconf files to the nodes"
    " in the cluster",
  ),
  "verify": (
    VerifyCluster,
    ARGS_NONE,
    [
      VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
      DRY_RUN_OPT, PRIORITY_OPT, NODEGROUP_OPT,
    ],
    "",
    "Does a check on the cluster configuration",
  ),
  "verify-disks": (
    VerifyDisks,
    ARGS_NONE,
    [PRIORITY_OPT],
    "",
    "Does a check on the cluster disk status",
  ),
  "repair-disk-sizes": (
    RepairDiskSizes,
    ARGS_MANY_INSTANCES,
    [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]",
    "Updates mismatches in recorded disk sizes",
  ),
  "master-failover": (
    MasterFailover,
    ARGS_NONE,
    [NOVOTING_OPT],
    "",
    "Makes the current node the master",
  ),
  "master-ping": (
    MasterPing,
    ARGS_NONE,
    [],
    "",
    "Checks if the master is alive",
  ),
  "version": (
    ShowClusterVersion,
    ARGS_NONE,
    [],
    "",
    "Shows the cluster version",
  ),
  "getmaster": (
    ShowClusterMaster,
    ARGS_NONE,
    [],
    "",
    "Shows the cluster master",
  ),
  "copyfile": (
    ClusterCopyFile,
    [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>",
    "Copies a file to all (or only some) nodes",
  ),
  "command": (
    RunClusterCommand,
    [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT],
    "[-n node...] <command>",
    "Runs a command on all (or only some) nodes",
  ),
  "info": (
    ShowClusterConfig,
    ARGS_NONE,
    [ROMAN_OPT],
    "[--roman]",
    "Show cluster configuration",
  ),
  "list-tags": (
    ListTags,
    ARGS_NONE,
    [],
    "",
    "List the tags of the cluster",
  ),
  "add-tags": (
    AddTags,
    [ArgUnknown()],
    [TAG_SRC_OPT, PRIORITY_OPT],
    "tag...",
    "Add tags to the cluster",
  ),
  "remove-tags": (
    RemoveTags,
    [ArgUnknown()],
    [TAG_SRC_OPT, PRIORITY_OPT],
    "tag...",
    "Remove tags from the cluster",
  ),
  "search-tags": (
    SearchTags,
    [ArgUnknown(min=1, max=1)],
    [PRIORITY_OPT],
    "",
    "Searches the tags on all objects on the cluster"
    " for a given pattern (regex)",
  ),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [],
    "drain|undrain|info",
    "Change queue properties",
  ),
  "watcher": (
    WatcherOps,
    [
      ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
      ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"]),
    ],
    [],
    "{pause <timespec>|continue|info}",
    "Change watcher properties",
  ),
  "modify": (
    SetClusterParams,
    ARGS_NONE,
    [
      BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, HVLIST_OPT,
      MASTER_NETDEV_OPT, NIC_PARAMS_OPT, NOLVM_STORAGE_OPT, VG_NAME_OPT,
      MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, ADD_UIDS_OPT, REMOVE_UIDS_OPT,
      DRBD_HELPER_OPT, NODRBD_STORAGE_OPT, DEFAULT_IALLOCATOR_OPT,
      RESERVED_LVS_OPT, DRY_RUN_OPT, PRIORITY_OPT, PREALLOC_WIPE_DISKS_OPT,
      NODE_PARAMS_OPT,
    ],
    "[opts...]",
    "Alters the parameters of the cluster",
  ),
  "renew-crypto": (
    RenewCrypto,
    ARGS_NONE,
    [
      NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
      NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
      NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
    ],
    "[opts...]",
    "Renews cluster certificates, keys and secrets",
  ),
  "epo": (
    Epo,
    [ArgUnknown()],
    [
      FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
      SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT,
    ],
    "[opts...] [args]",
    "Performs an emergency power-off on given args",
  ),
  }
1361
1362
1363
# Alternative spellings, mapping each alias to the name of a command
aliases = dict(masterfailover="master-failover")
1367
1368
1369 -def Main():
1372