1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """Functions to bootstrap a new cluster.
32
33 """
34
35 import os
36 import os.path
37 import re
38 import logging
39 import time
40 import tempfile
41
42 from ganeti.cmdlib import cluster
43 from ganeti import rpc
44 from ganeti import ssh
45 from ganeti import utils
46 from ganeti import errors
47 from ganeti import config
48 from ganeti import constants
49 from ganeti import objects
50 from ganeti import ssconf
51 from ganeti import serializer
52 from ganeti import hypervisor
53 from ganeti.storage import drbd
54 from ganeti.storage import filestorage
55 from ganeti import netutils
56 from ganeti import luxi
57 from ganeti import jstore
58 from ganeti import pathutils
59
60
61
62 _INITCONF_ECID = "initconfig-ecid"
63
64
65 _DAEMON_READY_TIMEOUT = 10.0
66
67
69 """Setup the SSH configuration for the cluster.
70
71 This generates a dsa keypair for root, adds the pub key to the
72 permitted hosts and adds the hostkey to its own known hosts.
73
74 """
75 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
76
77 for name in priv_key, pub_key:
78 if os.path.exists(name):
79 utils.CreateBackup(name)
80 utils.RemoveFile(name)
81
82 result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
83 "-f", priv_key,
84 "-q", "-N", ""])
85 if result.failed:
86 raise errors.OpExecError("Could not generate ssh keypair, error %s" %
87 result.output)
88
89 utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
90
91
93 """Writes a new HMAC key.
94
95 @type file_name: str
96 @param file_name: Path to output file
97
98 """
99 utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
100 backup=True)
101
102
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path
  @type cds_file: string
  @param cds_file: optional override of the cluster domain secret file path

  """
  # Cluster (noded) certificate: regenerate when explicitly requested or when
  # none exists yet; an existing certificate is backed up first.
  cluster_cert_exists = os.path.exists(nodecert_file)
  if new_cluster_cert or not cluster_cert_exists:
    if cluster_cert_exists:
      utils.CreateBackup(nodecert_file)

    logging.debug("Generating new cluster certificate at %s", nodecert_file)
    utils.GenerateSelfSignedSslCert(nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  # RAPI certificate: a caller-supplied PEM takes precedence over
  # (re)generation.
  rapi_cert_exists = os.path.exists(rapicert_file)

  if rapi_cert_pem:
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  elif new_rapi_cert or not rapi_cert_exists:
    if rapi_cert_exists:
      utils.CreateBackup(rapicert_file)

    logging.debug("Generating new RAPI certificate at %s", rapicert_file)
    utils.GenerateSelfSignedSslCert(rapicert_file)

  # SPICE certificate and the CA certificate that signed it
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # NOTE(review): assumes spice_cacert_pem is supplied together with
    # spice_cert_pem -- confirm with callers.
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # The certificate is self-signed, so the certificate itself also acts as
    # the CA certificate.  Bug fix: log the CA cert path (spicecacert_file),
    # which is the file actually written, instead of spicecert_file; also use
    # utils.WriteFile for consistency with the rest of this function.
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.WriteFile(spicecacert_file, mode=0o400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)
208
209
211 """Setup the necessary configuration for the initial node daemon.
212
213 This creates the nodepass file containing the shared password for
214 the cluster, generates the SSL certificate and starts the node daemon.
215
216 @type master_name: str
217 @param master_name: Name of the master node
218
219 """
220
221 GenerateClusterCrypto(True, False, False, False, False)
222
223 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
224 if result.failed:
225 raise errors.OpExecError("Could not start the node daemon, command %s"
226 " had exitcode %s and error %s" %
227 (result.cmd, result.exit_code, result.output))
228
229 _WaitForNodeDaemon(master_name)
230
231
242
243 try:
244 utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
245 except utils.RetryTimeout:
246 raise errors.OpExecError("Node daemon on %s didn't answer queries within"
247 " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))
248
249
262
263 try:
264 utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
265 except utils.RetryTimeout:
266 raise errors.OpExecError("Master daemon didn't answer queries within"
267 " %s seconds" % _DAEMON_READY_TIMEOUT)
268
269
271 """Wait for SSH daemon to become responsive.
272
273 """
274 hostip = netutils.GetHostname(name=hostname, family=family).ip
275
276 def _CheckSshDaemon():
277 if netutils.TcpPing(hostip, port, timeout=1.0, live_port_needed=True):
278 logging.debug("SSH daemon on %s:%s (IP address %s) has become"
279 " responsive", hostname, port, hostip)
280 else:
281 raise utils.RetryAgain()
282
283 try:
284 utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
285 except utils.RetryTimeout:
286 raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
287 " become responsive within %s seconds" %
288 (hostname, port, hostip, _DAEMON_READY_TIMEOUT))
289
290
def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check, data):
  """Runs a configuration command on a remote machine via SSH.

  The remote invocation first verifies that the versioned installation
  directory exists, refreshes the C{ganeti/lib} and C{ganeti/share}
  symlinks, and only then runs C{basecmd}; C{data} is serialized to JSON
  and fed to the command's standard input.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @param data: JSON-serializable input data for script (passed to stdin)

  """
  setup_cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    setup_cmd.append("--debug")

  if verbose:
    setup_cmd.append("--verbose")

  logging.debug("Node setup command: %s", setup_cmd)

  version = constants.DIR_VERSION
  pkglib_dir = os.path.join(pathutils.PKGLIBDIR, version)
  share_dir = os.path.join(pathutils.SHAREDIR, version)
  lib_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")
  share_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/share")

  all_cmds = [["test", "-d", pkglib_dir]]
  for (target, link) in [(pkglib_dir, lib_link), (share_dir, share_link)]:
    if constants.HAS_GNU_LN:
      # GNU ln can atomically replace the link in one call
      all_cmds.append(["ln", "-s", "-f", "-T", target, link])
    else:
      all_cmds.append(["rm", "-f", link])
      all_cmds.append(["ln", "-s", "-f", target, link])
  all_cmds.append(setup_cmd)

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  runner = ssh.SshRunner(cluster_name,
                         ipv6=(family == netutils.IP6Address.family))
  remote_cmd = runner.BuildCmd(node, constants.SSH_LOGIN_USER,
                               utils.ShellQuoteArgs(
                                 utils.ShellCombineCommands(all_cmds)),
                               batch=False, ask_key=ask_key, quiet=False,
                               strict_host_check=strict_host_check,
                               use_cluster_key=use_cluster_key)

  # The input data is passed through a temporary file so the SSH process can
  # read it as its stdin
  stdin_fh = tempfile.TemporaryFile()
  try:
    stdin_fh.write(serializer.DumpJson(data))
    stdin_fh.seek(0)

    result = utils.RunCmd(remote_cmd, interactive=True, input_fd=stdin_fh)
  finally:
    stdin_fh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, netutils.GetDaemonPort(constants.SSH), family)
371
372
374 """Initialize if needed the file storage.
375
376 @param file_storage_dir: the user-supplied value
377 @return: either empty string (if file storage was disabled at build
378 time) or the normalized path to the storage directory
379
380 """
381 file_storage_dir = os.path.normpath(file_storage_dir)
382
383 if not os.path.isabs(file_storage_dir):
384 raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
385 " path" % file_storage_dir, errors.ECODE_INVAL)
386
387 if not os.path.exists(file_storage_dir):
388 try:
389 os.makedirs(file_storage_dir, 0750)
390 except OSError, err:
391 raise errors.OpPrereqError("Cannot create file storage directory"
392 " '%s': %s" % (file_storage_dir, err),
393 errors.ECODE_ENVIRON)
394
395 if not os.path.isdir(file_storage_dir):
396 raise errors.OpPrereqError("The file storage directory '%s' is not"
397 " a directory." % file_storage_dir,
398 errors.ECODE_ENVIRON)
399
400 return file_storage_dir
401
402
407 """Checks if a file-base storage type is enabled and inits the dir.
408
409 @type enabled_disk_templates: list of string
410 @param enabled_disk_templates: list of enabled disk templates
411 @type file_storage_dir: string
412 @param file_storage_dir: the file storage directory
413 @type default_dir: string
414 @param default_dir: default file storage directory when C{file_storage_dir}
415 is 'None'
416 @type file_disk_template: string
417 @param file_disk_template: a disk template whose storage type is 'ST_FILE'
418 @rtype: string
419 @returns: the name of the actual file storage directory
420
421 """
422 assert (file_disk_template in
423 utils.storage.GetDiskTemplatesOfStorageType(constants.ST_FILE))
424 if file_storage_dir is None:
425 file_storage_dir = default_dir
426 if not acceptance_fn:
427 acceptance_fn = \
428 lambda path: filestorage.CheckFileStoragePathAcceptance(
429 path, exact_match_ok=True)
430
431 cluster.CheckFileStoragePathVsEnabledDiskTemplates(
432 logging.warning, file_storage_dir, enabled_disk_templates)
433
434 file_storage_enabled = file_disk_template in enabled_disk_templates
435 if file_storage_enabled:
436 try:
437 acceptance_fn(file_storage_dir)
438 except errors.FileStoragePathError as e:
439 raise errors.OpPrereqError(str(e))
440 result_file_storage_dir = init_fn(file_storage_dir)
441 else:
442 result_file_storage_dir = file_storage_dir
443 return result_file_storage_dir
444
445
458
459
472
473
475 """Checks the sanity of the enabled disk templates.
476
477 """
478 if not enabled_disk_templates:
479 raise errors.OpPrereqError("Enabled disk templates list must contain at"
480 " least one member", errors.ECODE_INVAL)
481 invalid_disk_templates = \
482 set(enabled_disk_templates) - constants.DISK_TEMPLATES
483 if invalid_disk_templates:
484 raise errors.OpPrereqError("Enabled disk templates list contains invalid"
485 " entries: %s" % invalid_disk_templates,
486 errors.ECODE_INVAL)
487
488
490 """Restricts the ipolicy's disk templates to the enabled ones.
491
492 This function clears the ipolicy's list of allowed disk templates from the
493 ones that are not enabled by the cluster.
494
495 @type ipolicy: dict
496 @param ipolicy: the instance policy
497 @type enabled_disk_templates: list of string
498 @param enabled_disk_templates: the list of cluster-wide enabled disk
499 templates
500
501 """
502 assert constants.IPOLICY_DTS in ipolicy
503 allowed_disk_templates = ipolicy[constants.IPOLICY_DTS]
504 restricted_disk_templates = list(set(allowed_disk_templates)
505 .intersection(set(enabled_disk_templates)))
506 ipolicy[constants.IPOLICY_DTS] = restricted_disk_templates
507
508
510 """Checks the DRBD usermode helper.
511
512 @type drbd_helper: string
513 @param drbd_helper: name of the DRBD usermode helper that the system should
514 use
515
516 """
517 if not drbd_enabled:
518 return
519
520 if drbd_helper is not None:
521 try:
522 curr_helper = drbd.DRBD8.GetUsermodeHelper()
523 except errors.BlockDeviceError, err:
524 raise errors.OpPrereqError("Error while checking drbd helper"
525 " (disable drbd with --enabled-disk-templates"
526 " if you are not using drbd): %s" % str(err),
527 errors.ECODE_ENVIRON)
528 if drbd_helper != curr_helper:
529 raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
530 " is the current helper" % (drbd_helper,
531 curr_helper),
532 errors.ECODE_INVAL)
533
534
def InitCluster(cluster_name, mac_prefix,
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None, enabled_disk_templates=None):
  """Initialise the cluster.

  Runs on the prospective master node: performs all prerequisite checks
  (networking, storage, parameter syntax), writes the initial configuration
  and ssconf files, and (re)starts the Ganeti daemons.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of disk_templates to be used in this
    cluster

  """
  # Refuse to run twice on the same machine
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  _InitCheckEnabledDiskTemplates(enabled_disk_templates)

  # Resolve the IP address class (IPv4/IPv6) for the requested version
  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  # This host's own name must resolve to a valid, non-loopback address that
  # actually belongs to one of its interfaces
  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  # The cluster IP must not be in use yet anywhere
  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  # The secondary IP is always IPv4; with an IPv6 primary it cannot be
  # defaulted from the hostname and must be given explicitly
  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  # Default the netmask to the full prefix length of the address family
  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s " %
                                 (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name:
    # Check if the volume group exists and is large enough
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s" % vgstatus, errors.ECODE_INVAL)

  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
  _InitCheckDrbdHelper(drbd_helper, drbd_enabled)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  file_storage_dir = _PrepareFileStorage(enabled_disk_templates,
                                         file_storage_dir)
  shared_file_storage_dir = _PrepareSharedFileStorage(enabled_disk_templates,
                                                      shared_file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  if not nicparams.get('mode', None) == constants.NIC_MODE_OVS:
    # Do not check the master netdev in openvswitch mode, since the switch
    # device does not exist yet at this point
    result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
    if result.failed:
      raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                                 (master_netdev,
                                  result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  # Normalize and syntax-check the various parameter dictionaries (note:
  # utils.ForceDictType modifies the dicts in place)
  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)
  _RestrictIpolicyToEnabledDiskTemplates(full_ipolicy, enabled_disk_templates)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # Hypervisor state: validate supplied entries or default one per enabled
  # hypervisor
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # Disk state: validate only; no defaults are generated here
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # Per-hypervisor parameter syntax checks
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # Disk template parameters: only keys known in the built-in defaults are
  # accepted
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default DRBD metadata VG is the cluster VG chosen at init time,
      # if one was provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verify diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # Collect this host's SSH public host keys; at least one of RSA/DSA must
  # exist
  rsa_sshkey = ""
  dsa_sshkey = ""
  if os.path.isfile(pathutils.SSH_HOST_RSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
    rsa_sshkey = sshline.split(" ")[1]
  if os.path.isfile(pathutils.SSH_HOST_DSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_DSA_PUB)
    dsa_sshkey = sshline.split(" ")[1]
  if not rsa_sshkey and not dsa_sshkey:
    raise errors.OpPrereqError("Failed to find SSH public keys",
                               errors.ECODE_ENVIRON)

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  # Default iallocator: an explicit one must exist on disk; otherwise fall
  # back to hail when htools support was built in and the binary is found
  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  now = time.time()

  # Assemble the initial cluster configuration object
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=rsa_sshkey,
    dsahostkeypub=dsa_sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  # Write the configuration, known_hosts and ssconf files while the daemons
  # are stopped (offline config writer)
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # Generate the node certificate and start noded on this node
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()
825
826
829 """Create the initial cluster configuration.
830
831 It will contain the current node, which will also be the master
832 node, and no instances.
833
834 @type version: int
835 @param version: configuration version
836 @type cluster_config: L{objects.Cluster}
837 @param cluster_config: cluster configuration
838 @type master_node_config: L{objects.Node}
839 @param master_node_config: master node configuration
840 @type cfg_file: string
841 @param cfg_file: configuration file path
842
843 """
844 uuid_generator = config.TemporaryReservationManager()
845 cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
846 _INITCONF_ECID)
847 master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
848 _INITCONF_ECID)
849 cluster_config.master_node = master_node_config.uuid
850 nodes = {
851 master_node_config.uuid: master_node_config,
852 }
853 default_nodegroup = objects.NodeGroup(
854 uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
855 name=constants.INITIAL_NODE_GROUP_NAME,
856 members=[master_node_config.uuid],
857 diskparams={},
858 )
859 nodegroups = {
860 default_nodegroup.uuid: default_nodegroup,
861 }
862 now = time.time()
863 config_data = objects.ConfigData(version=version,
864 cluster=cluster_config,
865 nodegroups=nodegroups,
866 nodes=nodes,
867 instances={},
868 networks={},
869 serial_no=1,
870 ctime=now, mtime=now)
871 utils.WriteFile(cfg_file,
872 data=serializer.Dump(config_data.ToDict()),
873 mode=0600)
874
875
877 """Execute the last steps of cluster destroy
878
879 This function shuts down all the daemons, completing the destroy
880 begun in cmdlib.LUDestroyOpcode.
881
882 """
883 cfg = config.ConfigWriter()
884 modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
885 runner = rpc.BootstrapRunner()
886
887 master_name = cfg.GetNodeName(master_uuid)
888
889 master_params = cfg.GetMasterNetworkParameters()
890 master_params.uuid = master_uuid
891 ems = cfg.GetUseExternalMipScript()
892 result = runner.call_node_deactivate_master_ip(master_name, master_params,
893 ems)
894
895 msg = result.fail_msg
896 if msg:
897 logging.warning("Could not disable the master IP: %s", msg)
898
899 result = runner.call_node_stop_master(master_name)
900 msg = result.fail_msg
901 if msg:
902 logging.warning("Could not disable the master role: %s", msg)
903
904 result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
905 msg = result.fail_msg
906 if msg:
907 logging.warning("Could not shutdown the node daemon and cleanup"
908 " the node: %s", msg)
909
910
912 """Add a node to the cluster.
913
914 This function must be called before the actual opcode, and will ssh
915 to the remote node, copy the needed files, and start ganeti-noded,
916 allowing the master to do the rest via normal rpc calls.
917
918 @param cluster_name: the cluster name
919 @param node: the name of the new node
920
921 """
922 data = {
923 constants.NDS_CLUSTER_NAME: cluster_name,
924 constants.NDS_NODE_DAEMON_CERTIFICATE:
925 utils.ReadFile(pathutils.NODED_CERT_FILE),
926 constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
927 constants.NDS_START_NODE_DAEMON: True,
928 }
929
930 RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
931 opts.debug, opts.verbose,
932 True, opts.ssh_key_check, opts.ssh_key_check, data)
933
934 _WaitForNodeDaemon(node)
935
936
938 """Failover the master node.
939
940 This checks that we are not already the master, and will cause the
941 current master to cease being master, and the non-master to become
942 new master.
943
944 @type no_voting: boolean
945 @param no_voting: force the operation without remote nodes agreement
946 (dangerous)
947
948 @returns: the pair of an exit code and warnings to display
949 """
950 sstore = ssconf.SimpleStore()
951
952 old_master, new_master = ssconf.GetMasterAndMyself(sstore)
953 node_names = sstore.GetNodeList()
954 mc_list = sstore.GetMasterCandidates()
955
956 if old_master == new_master:
957 raise errors.OpPrereqError("This commands must be run on the node"
958 " where you want the new master to be."
959 " %s is already the master" %
960 old_master, errors.ECODE_INVAL)
961
962 if new_master not in mc_list:
963 mc_no_master = [name for name in mc_list if name != old_master]
964 raise errors.OpPrereqError("This node is not among the nodes marked"
965 " as master candidates. Only these nodes"
966 " can become masters. Current list of"
967 " master candidates is:\n"
968 "%s" % ("\n".join(mc_no_master)),
969 errors.ECODE_STATE)
970
971 if not no_voting:
972 vote_list = GatherMasterVotes(node_names)
973
974 if vote_list:
975 voted_master = vote_list[0][0]
976 if voted_master is None:
977 raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
978 " not respond.", errors.ECODE_ENVIRON)
979 elif voted_master != old_master:
980 raise errors.OpPrereqError("I have a wrong configuration, I believe"
981 " the master is %s but the other nodes"
982 " voted %s. Please resync the configuration"
983 " of this node." %
984 (old_master, voted_master),
985 errors.ECODE_STATE)
986
987
988 rcode = 0
989 warnings = []
990
991 logging.info("Setting master to %s, old master: %s", new_master, old_master)
992
993 try:
994
995
996 cfg = config.ConfigWriter(accept_foreign=True)
997
998 old_master_node = cfg.GetNodeInfoByName(old_master)
999 if old_master_node is None:
1000 raise errors.OpPrereqError("Could not find old master node '%s' in"
1001 " cluster configuration." % old_master,
1002 errors.ECODE_NOENT)
1003
1004 cluster_info = cfg.GetClusterInfo()
1005 new_master_node = cfg.GetNodeInfoByName(new_master)
1006 if new_master_node is None:
1007 raise errors.OpPrereqError("Could not find new master node '%s' in"
1008 " cluster configuration." % new_master,
1009 errors.ECODE_NOENT)
1010
1011 cluster_info.master_node = new_master_node.uuid
1012
1013
1014 cfg.Update(cluster_info, logging.error)
1015 except errors.ConfigurationError, err:
1016 logging.error("Error while trying to set the new master: %s",
1017 str(err))
1018 return 1
1019
1020
1021
1022
1023
1024
1025 logging.info("Stopping the master daemon on node %s", old_master)
1026
1027 runner = rpc.BootstrapRunner()
1028 master_params = cfg.GetMasterNetworkParameters()
1029 master_params.uuid = old_master_node.uuid
1030 ems = cfg.GetUseExternalMipScript()
1031 result = runner.call_node_deactivate_master_ip(old_master,
1032 master_params, ems)
1033
1034 msg = result.fail_msg
1035 if msg:
1036 warning = "Could not disable the master IP: %s" % (msg,)
1037 logging.warning("%s", warning)
1038 warnings.append(warning)
1039
1040 result = runner.call_node_stop_master(old_master)
1041 msg = result.fail_msg
1042 if msg:
1043 warning = ("Could not disable the master role on the old master"
1044 " %s, please disable manually: %s" % (old_master, msg))
1045 logging.error("%s", warning)
1046 warnings.append(warning)
1047
1048 logging.info("Checking master IP non-reachability...")
1049
1050 master_ip = sstore.GetMasterIP()
1051 total_timeout = 30
1052
1053
1054 def _check_ip(expected):
1055 if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT) != expected:
1056 raise utils.RetryAgain()
1057
1058 try:
1059 utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[False])
1060 except utils.RetryTimeout:
1061 warning = ("The master IP is still reachable after %s seconds,"
1062 " continuing but activating the master IP on the current"
1063 " node will probably fail" % total_timeout)
1064 logging.warning("%s", warning)
1065 warnings.append(warning)
1066 rcode = 1
1067
1068 if jstore.CheckDrainFlag():
1069 logging.info("Undraining job queue")
1070 jstore.SetDrainFlag(False)
1071
1072 logging.info("Starting the master daemons on the new master")
1073
1074 result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
1075 no_voting)
1076 msg = result.fail_msg
1077 if msg:
1078 logging.error("Could not start the master role on the new master"
1079 " %s, please check: %s", new_master, msg)
1080 rcode = 1
1081
1082
1083
1084 try:
1085 utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[True])
1086 except utils.RetryTimeout:
1087 warning = ("The master IP did not come up within %s seconds; the"
1088 " cluster should still be working and reachable via %s,"
1089 " but not via the master IP address"
1090 % (total_timeout, new_master))
1091 logging.warning("%s", warning)
1092 warnings.append(warning)
1093 rcode = 1
1094
1095 logging.info("Master failed over from %s to %s", old_master, new_master)
1096 return rcode, warnings
1097
1098
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing directly ssconf, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  @rtype: string
  @return: the name of the current master node, as stored in ssconf

  """
  sstore = ssconf.SimpleStore()

  # GetMasterAndMyself returns (master_name, my_name); we only need the
  # former here
  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master
1113
1114
def GatherMasterVotes(node_names):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and bootstrap, we'll always vote for ourselves.

  @type node_names: list
  @param node_names: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  # We never query ourselves: we would always vote for our own
  # configuration's master, so remove our own name if present.
  try:
    node_names.remove(myself)
  except ValueError:
    pass
  if not node_names:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_info(node_names)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc); treat all
    # nodes as failed so the caller sees a fully inconsistent vote
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_names))]
  votes = {}
  for node_name in results:
    nres = results[node_name]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node_name, msg)
      fail = True
    # The master_info payload is expected to be a tuple/list of at least
    # three elements, the master name being the third one; anything else
    # counts as a failed (None) vote.
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s",
                      node_name, data)
      fail = True
    if fail:
      # errors are tallied under the None key
      votes[None] = votes.get(None, 0) + 1
      continue
    master_node = data[2]
    votes[master_node] = votes.get(master_node, 0) + 1

  # Sort by number of votes (descending), breaking ties on the node
  # name so the result is deterministic.
  vote_list = list(votes.items())
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
1181