22 """Functions to bootstrap a new cluster.
23
24 """
25
26 import os
27 import os.path
28 import re
29 import logging
30 import time
31 import tempfile
32
33 from ganeti import rpc
34 from ganeti import ssh
35 from ganeti import utils
36 from ganeti import errors
37 from ganeti import config
38 from ganeti import constants
39 from ganeti import objects
40 from ganeti import ssconf
41 from ganeti import serializer
42 from ganeti import hypervisor
43 from ganeti import bdev
44 from ganeti import netutils
45 from ganeti import luxi
46 from ganeti import jstore
47 from ganeti import pathutils
48
49
50
51 _INITCONF_ECID = "initconfig-ecid"
52
53
54 _DAEMON_READY_TIMEOUT = 10.0
55
56
58 """Setup the SSH configuration for the cluster.
59
60 This generates a dsa keypair for root, adds the pub key to the
61 permitted hosts and adds the hostkey to its own known hosts.
62
63 """
64 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
65
66 for name in priv_key, pub_key:
67 if os.path.exists(name):
68 utils.CreateBackup(name)
69 utils.RemoveFile(name)
70
71 result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
72 "-f", priv_key,
73 "-q", "-N", ""])
74 if result.failed:
75 raise errors.OpExecError("Could not generate ssh keypair, error %s" %
76 result.output)
77
78 utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
79
80
82 """Writes a new HMAC key.
83
84 @type file_name: str
85 @param file_name: Path to output file
86
87 """
88 utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
89 backup=True)
90
91
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
      certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path
  @type cds_file: string
  @param cds_file: optional override of the cluster domain secret file path

  """
  # noded SSL certificate
  cluster_cert_exists = os.path.exists(nodecert_file)
  if new_cluster_cert or not cluster_cert_exists:
    if cluster_cert_exists:
      utils.CreateBackup(nodecert_file)

    logging.debug("Generating new cluster certificate at %s", nodecert_file)
    utils.GenerateSelfSignedSslCert(nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  # RAPI
  rapi_cert_exists = os.path.exists(rapicert_file)

  if rapi_cert_pem:
    # Assume rapi_cert_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  elif new_rapi_cert or not rapi_cert_exists:
    if rapi_cert_exists:
      utils.CreateBackup(rapicert_file)

    logging.debug("Generating new RAPI certificate at %s", rapicert_file)
    utils.GenerateSelfSignedSslCert(rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)


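# Example (an editorial sketch, not part of the original module): to renew
# only the RAPI certificate while keeping every other credential in place,
# all other flags stay False and the file paths keep their defaults:
#
#   GenerateClusterCrypto(False, True, False, False, False)
#
# _InitGanetiServerSetup() below uses the same entry point with
# new_cluster_cert=True to create the initial cluster certificate.

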
200 """Setup the necessary configuration for the initial node daemon.
201
202 This creates the nodepass file containing the shared password for
203 the cluster, generates the SSL certificate and starts the node daemon.
204
205 @type master_name: str
206 @param master_name: Name of the master node
207
208 """
209
210 GenerateClusterCrypto(True, False, False, False, False)
211
212 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
213 if result.failed:
214 raise errors.OpExecError("Could not start the node daemon, command %s"
215 " had exitcode %s and error %s" %
216 (result.cmd, result.exit_code, result.output))
217
218 _WaitForNodeDaemon(master_name)
219
220
def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Query the freshly started daemon; retry until it answers
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))


def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @param data: JSON-serializable input data for script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  srun = ssh.SshRunner(cluster_name,
                       ipv6=(family == netutils.IP6Address.family))
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(cmd),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key)

  tempfh = tempfile.TemporaryFile()
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))


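# SetupNodeDaemon() below is this module's caller of RunNodeSetupCmd(). As an
# illustration (hypothetical host names, sketch only), preparing a new node
# with verbose output and strict host key checking would look like:
#
#   RunNodeSetupCmd("cluster.example.com", "node1.example.com",
#                   pathutils.NODE_DAEMON_SETUP, False, True,
#                   True, False, True, {"example": "payload"})
#
# The "data" argument is JSON-serialized and fed to the remote script's stdin.

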
315 """Initialize if needed the file storage.
316
317 @param file_storage_dir: the user-supplied value
318 @return: either empty string (if file storage was disabled at build
319 time) or the normalized path to the storage directory
320
321 """
322 file_storage_dir = os.path.normpath(file_storage_dir)
323
324 if not os.path.isabs(file_storage_dir):
325 raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
326 " path" % file_storage_dir, errors.ECODE_INVAL)
327
328 if not os.path.exists(file_storage_dir):
329 try:
330 os.makedirs(file_storage_dir, 0750)
331 except OSError, err:
332 raise errors.OpPrereqError("Cannot create file storage directory"
333 " '%s': %s" % (file_storage_dir, err),
334 errors.ECODE_ENVIRON)
335
336 if not os.path.isdir(file_storage_dir):
337 raise errors.OpPrereqError("The file storage directory '%s' is not"
338 " a directory." % file_storage_dir,
339 errors.ECODE_ENVIRON)
340 return file_storage_dir
341
342
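# For example, _InitFileStorage("/srv/ganeti//file-storage") would normalize
# and return "/srv/ganeti/file-storage", creating the directory with mode
# 0750 if missing; a relative path raises errors.OpPrereqError.

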
def InitCluster(cluster_name, mac_prefix,
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this IP address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s" %
                                 (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus,
                                 errors.ECODE_INVAL)

  if drbd_helper is not None:
    try:
      curr_helper = bdev.BaseDRBD.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (specify --no-drbd-storage if you are not"
                                 " using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  if constants.ENABLE_FILE_STORAGE:
    file_storage_dir = _InitFileStorage(file_storage_dir)
  else:
    file_storage_dir = ""

  if constants.ENABLE_SHARED_FILE_STORAGE:
    shared_file_storage_dir = _InitFileStorage(shared_file_storage_dir)
  else:
    shared_file_storage_dir = ""

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure?
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # Check the syntax of the hypervisor parameters for each enabled hypervisor
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # Validate the disk template parameters against the known defaults
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


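# For orientation, a minimal invocation might look as follows (an editorial
# sketch only: all values are illustrative, and the parameter dicts normally
# arrive pre-filled with defaults from the command-line layer rather than
# being empty):
#
#   InitCluster("cluster.example.com", "aa:00:00", None, "eth0",
#               "/srv/ganeti/file-storage", "/srv/ganeti/shared-file-storage",
#               10, enabled_hypervisors=[constants.HT_XEN_PVM],
#               beparams={}, nicparams={}, hvparams={}, diskparams={},
#               primary_ip_version=constants.IP4_VERSION)

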
636 """Create the initial cluster configuration.
637
638 It will contain the current node, which will also be the master
639 node, and no instances.
640
641 @type version: int
642 @param version: configuration version
643 @type cluster_config: L{objects.Cluster}
644 @param cluster_config: cluster configuration
645 @type master_node_config: L{objects.Node}
646 @param master_node_config: master node configuration
647 @type cfg_file: string
648 @param cfg_file: configuration file path
649
650 """
651 uuid_generator = config.TemporaryReservationManager()
652 cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
653 _INITCONF_ECID)
654 master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
655 _INITCONF_ECID)
656 nodes = {
657 master_node_config.name: master_node_config,
658 }
659 default_nodegroup = objects.NodeGroup(
660 uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
661 name=constants.INITIAL_NODE_GROUP_NAME,
662 members=[master_node_config.name],
663 diskparams={},
664 )
665 nodegroups = {
666 default_nodegroup.uuid: default_nodegroup,
667 }
668 now = time.time()
669 config_data = objects.ConfigData(version=version,
670 cluster=cluster_config,
671 nodegroups=nodegroups,
672 nodes=nodes,
673 instances={},
674 networks={},
675 serial_no=1,
676 ctime=now, mtime=now)
677 utils.WriteFile(cfg_file,
678 data=serializer.Dump(config_data.ToDict()),
679 mode=0600)
680
681
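# The file written here is read back immediately by InitCluster() above via
# config.ConfigWriter(offline=True), which then regenerates the ssconf files.

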
683 """Execute the last steps of cluster destroy
684
685 This function shuts down all the daemons, completing the destroy
686 begun in cmdlib.LUDestroyOpcode.
687
688 """
689 cfg = config.ConfigWriter()
690 modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
691 runner = rpc.BootstrapRunner()
692
693 master_params = cfg.GetMasterNetworkParameters()
694 master_params.name = master
695 ems = cfg.GetUseExternalMipScript()
696 result = runner.call_node_deactivate_master_ip(master_params.name,
697 master_params, ems)
698
699 msg = result.fail_msg
700 if msg:
701 logging.warning("Could not disable the master IP: %s", msg)
702
703 result = runner.call_node_stop_master(master)
704 msg = result.fail_msg
705 if msg:
706 logging.warning("Could not disable the master role: %s", msg)
707
708 result = runner.call_node_leave_cluster(master, modify_ssh_setup)
709 msg = result.fail_msg
710 if msg:
711 logging.warning("Could not shutdown the node daemon and cleanup"
712 " the node: %s", msg)
713
714
716 """Add a node to the cluster.
717
718 This function must be called before the actual opcode, and will ssh
719 to the remote node, copy the needed files, and start ganeti-noded,
720 allowing the master to do the rest via normal rpc calls.
721
722 @param cluster_name: the cluster name
723 @param node: the name of the new node
724
725 """
726 data = {
727 constants.NDS_CLUSTER_NAME: cluster_name,
728 constants.NDS_NODE_DAEMON_CERTIFICATE:
729 utils.ReadFile(pathutils.NODED_CERT_FILE),
730 constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
731 constants.NDS_START_NODE_DAEMON: True,
732 }
733
734 RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
735 opts.debug, opts.verbose,
736 True, opts.ssh_key_check, opts.ssh_key_check, data)
737
738 _WaitForNodeDaemon(node)
739
740
742 """Failover the master node.
743
744 This checks that we are not already the master, and will cause the
745 current master to cease being master, and the non-master to become
746 new master.
747
748 @type no_voting: boolean
749 @param no_voting: force the operation without remote nodes agreement
750 (dangerous)
751
752 """
753 sstore = ssconf.SimpleStore()
754
755 old_master, new_master = ssconf.GetMasterAndMyself(sstore)
756 node_list = sstore.GetNodeList()
757 mc_list = sstore.GetMasterCandidates()
758
759 if old_master == new_master:
760 raise errors.OpPrereqError("This commands must be run on the node"
761 " where you want the new master to be."
762 " %s is already the master" %
763 old_master, errors.ECODE_INVAL)
764
765 if new_master not in mc_list:
766 mc_no_master = [name for name in mc_list if name != old_master]
767 raise errors.OpPrereqError("This node is not among the nodes marked"
768 " as master candidates. Only these nodes"
769 " can become masters. Current list of"
770 " master candidates is:\n"
771 "%s" % ("\n".join(mc_no_master)),
772 errors.ECODE_STATE)
773
774 if not no_voting:
775 vote_list = GatherMasterVotes(node_list)
776
777 if vote_list:
778 voted_master = vote_list[0][0]
779 if voted_master is None:
780 raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
781 " not respond.", errors.ECODE_ENVIRON)
782 elif voted_master != old_master:
783 raise errors.OpPrereqError("I have a wrong configuration, I believe"
784 " the master is %s but the other nodes"
785 " voted %s. Please resync the configuration"
786 " of this node." %
787 (old_master, voted_master),
788 errors.ECODE_STATE)
789
790
791 rcode = 0
792
793 logging.info("Setting master to %s, old master: %s", new_master, old_master)
794
795 try:
796
797
798 cfg = config.ConfigWriter(accept_foreign=True)
799
800 cluster_info = cfg.GetClusterInfo()
801 cluster_info.master_node = new_master
802
803
804 cfg.Update(cluster_info, logging.error)
805 except errors.ConfigurationError, err:
806 logging.error("Error while trying to set the new master: %s",
807 str(err))
808 return 1
809
810
811
812
813
814
815 logging.info("Stopping the master daemon on node %s", old_master)
816
817 runner = rpc.BootstrapRunner()
818 master_params = cfg.GetMasterNetworkParameters()
819 master_params.name = old_master
820 ems = cfg.GetUseExternalMipScript()
821 result = runner.call_node_deactivate_master_ip(master_params.name,
822 master_params, ems)
823
824 msg = result.fail_msg
825 if msg:
826 logging.warning("Could not disable the master IP: %s", msg)
827
828 result = runner.call_node_stop_master(old_master)
829 msg = result.fail_msg
830 if msg:
831 logging.error("Could not disable the master role on the old master"
832 " %s, please disable manually: %s", old_master, msg)
833
834 logging.info("Checking master IP non-reachability...")
835
836 master_ip = sstore.GetMasterIP()
837 total_timeout = 30
838
839
840 def _check_ip():
841 if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
842 raise utils.RetryAgain()
843
844 try:
845 utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
846 except utils.RetryTimeout:
847 logging.warning("The master IP is still reachable after %s seconds,"
848 " continuing but activating the master on the current"
849 " node will probably fail", total_timeout)
850
851 if jstore.CheckDrainFlag():
852 logging.info("Undraining job queue")
853 jstore.SetDrainFlag(False)
854
855 logging.info("Starting the master daemons on the new master")
856
857 result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
858 no_voting)
859 msg = result.fail_msg
860 if msg:
861 logging.error("Could not start the master role on the new master"
862 " %s, please check: %s", new_master, msg)
863 rcode = 1
864
865 logging.info("Master failed over from %s to %s", old_master, new_master)
866 return rcode
867
868
870 """Returns the current master node.
871
872 This is a separate function in bootstrap since it's needed by
873 gnt-cluster, and instead of importing directly ssconf, it's better
874 to abstract it in bootstrap, where we do use ssconf in other
875 functions too.
876
877 """
878 sstore = ssconf.SimpleStore()
879
880 old_master, _ = ssconf.GetMasterAndMyself(sstore)
881
882 return old_master
883
884
886 """Check the agreement on who is the master.
887
888 This function will return a list of (node, number of votes), ordered
889 by the number of votes. Errors will be denoted by the key 'None'.
890
891 Note that the sum of votes is the number of nodes this machine
892 knows, whereas the number of entries in the list could be different
893 (if some nodes vote for another master).
894
895 We remove ourselves from the list since we know that (bugs aside)
896 since we use the same source for configuration information for both
897 backend and boostrap, we'll always vote for ourselves.
898
899 @type node_list: list
900 @param node_list: the list of nodes to query for master info; the current
901 node will be removed if it is in the list
902 @rtype: list
903 @return: list of (node, votes)
904
905 """
906 myself = netutils.Hostname.GetSysName()
907 try:
908 node_list.remove(myself)
909 except ValueError:
910 pass
911 if not node_list:
912
913 return []
914 results = rpc.BootstrapRunner().call_master_info(node_list)
915 if not isinstance(results, dict):
916
917 logging.critical("Can't complete rpc call, aborting master startup")
918 return [(None, len(node_list))]
919 votes = {}
920 for node in results:
921 nres = results[node]
922 data = nres.payload
923 msg = nres.fail_msg
924 fail = False
925 if msg:
926 logging.warning("Error contacting node %s: %s", node, msg)
927 fail = True
928
929
930 elif not isinstance(data, (tuple, list)) or len(data) < 3:
931 logging.warning("Invalid data received from node %s: %s", node, data)
932 fail = True
933 if fail:
934 if None not in votes:
935 votes[None] = 0
936 votes[None] += 1
937 continue
938 master_node = data[2]
939 if master_node not in votes:
940 votes[master_node] = 0
941 votes[master_node] += 1
942
943 vote_list = [v for v in votes.items()]
944
945
946
947 vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
948
949 return vote_list
950
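

# For illustration (hypothetical host names): querying five nodes where three
# vote for "node1", one votes for "node2" and one does not respond would
# return [("node1", 3), ("node2", 1), (None, 1)].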