Package ganeti :: Module bootstrap

Source Code for Module ganeti.bootstrap

#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time
import tempfile

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti import bdev
from ganeti import netutils
from ganeti import luxi
from ganeti import jstore
from ganeti import pathutils


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: After how many seconds daemon must be responsive
_DAEMON_READY_TIMEOUT = 10.0

def _InitSSHSetup():
  """Setup the SSH configuration for the cluster.

  This generates a dsa keypair for root, adds the pub key to the
  permitted hosts and adds the hostkey to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

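# --- Editor's illustration (not part of the original module) ----------------
# _InitSSHSetup() rewrites root's SSH key material at the paths returned by
# ssh.GetUserFiles().  A hedged, read-only sketch of inspecting those paths
# without regenerating anything; _ExampleListSshFiles is a hypothetical name.
def _ExampleListSshFiles():
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
  for path in (priv_key, pub_key, auth_keys):
    logging.debug("%s exists: %s", path, os.path.exists(path))
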
def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)

def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path

  """
  # noded SSL certificate
  cluster_cert_exists = os.path.exists(nodecert_file)
  if new_cluster_cert or not cluster_cert_exists:
    if cluster_cert_exists:
      utils.CreateBackup(nodecert_file)

    logging.debug("Generating new cluster certificate at %s", nodecert_file)
    utils.GenerateSelfSignedSslCert(nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  # RAPI
  rapi_cert_exists = os.path.exists(rapicert_file)

  if rapi_cert_pem:
    # Assume rapi_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  elif new_rapi_cert or not rapi_cert_exists:
    if rapi_cert_exists:
      utils.CreateBackup(rapicert_file)

    logging.debug("Generating new RAPI certificate at %s", rapicert_file)
    utils.GenerateSelfSignedSslCert(rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)

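# --- Editor's illustration (not part of the original module) ----------------
# A minimal usage sketch of GenerateClusterCrypto: with only new_rapi_cert
# set, the RAPI certificate is regenerated, while the node certificate, SPICE
# files, confd HMAC key and cluster domain secret are left alone unless they
# are missing.  _ExampleRenewRapiCert is a hypothetical wrapper, used only so
# the call does not run at import time.
def _ExampleRenewRapiCert():
  GenerateClusterCrypto(False,  # new_cluster_cert
                        True,   # new_rapi_cert
                        False,  # new_spice_cert
                        False,  # new_confd_hmac_key
                        False)  # new_cds
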
def _InitGanetiServerSetup(master_name):
  """Setup the necessary configuration for the initial node daemon.

  This creates the nodepass file containing the shared password for
  the cluster, generates the SSL certificate and starts the node daemon.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False)

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)

def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" %
                             (node_name, _DAEMON_READY_TIMEOUT))

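# --- Editor's illustration (not part of the original module) ----------------
# _WaitForNodeDaemon above and _WaitForMasterDaemon below share the same
# idiom: a probe callback raises utils.RetryAgain() until it succeeds, and
# utils.Retry() bounds the polling by _DAEMON_READY_TIMEOUT.  A hedged generic
# sketch of that pattern; _WaitForResponsive is a hypothetical name.
def _WaitForResponsive(probe, description):
  try:
    utils.Retry(probe, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("%s didn't become responsive within %s seconds" %
                             (description, _DAEMON_READY_TIMEOUT))
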
def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)

def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @param data: JSON-serializable input data for script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  srun = ssh.SshRunner(cluster_name,
                       ipv6=(family == netutils.IP6Address.family))
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(cmd),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key)

  tempfh = tempfile.TemporaryFile()
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

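# --- Editor's illustration (not part of the original module) ----------------
# RunNodeSetupCmd serializes "data" to JSON and feeds it to the remote
# command's stdin over SSH.  A hedged sketch of a direct call; the host names
# and flag values are placeholders and _ExampleNodeSetup is hypothetical
# (SetupNodeDaemon further down is the real in-tree caller).
def _ExampleNodeSetup():
  RunNodeSetupCmd("cluster.example.com",        # cluster_name
                  "node1.example.com",          # node
                  pathutils.NODE_DAEMON_SETUP,  # basecmd
                  False,                        # debug
                  True,                         # verbose
                  True,                         # use_cluster_key
                  False,                        # ask_key
                  True,                         # strict_host_check
                  {constants.NDS_START_NODE_DAEMON: True})
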
def _InitFileStorage(file_storage_dir):
  """Initialize the file storage, if needed.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
      time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)
  return file_storage_dir

def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s " %
                                 (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus,
                                 errors.ECODE_INVAL)

  if drbd_helper is not None:
    try:
      curr_helper = bdev.BaseDRBD.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (specify --no-drbd-storage if you are not"
                                 " using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  if constants.ENABLE_FILE_STORAGE:
    file_storage_dir = _InitFileStorage(file_storage_dir)
  else:
    file_storage_dir = ""

  if constants.ENABLE_SHARED_FILE_STORAGE:
    shared_file_storage_dir = _InitFileStorage(shared_file_storage_dir)
  else:
    shared_file_storage_dir = ""

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verify diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()

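# --- Editor's illustration (not part of the original module) ----------------
# One detail of the validation in InitCluster above: the MAC prefix must be
# exactly three colon-separated, lower-case octets.  A small self-contained
# check using the module-level "re" import; _ExampleMacPrefixCheck is a
# hypothetical name.
def _ExampleMacPrefixCheck():
  mac_re = "^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$"
  assert re.match(mac_re, "aa:00:00")         # accepted
  assert not re.match(mac_re, "AA:00:00")     # upper case is rejected
  assert not re.match(mac_re, "aa:00:00:11")  # full MAC addresses are rejected
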
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=pathutils.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  nodes = {
    master_node_config.name: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.name],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)

def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_params = cfg.GetMasterNetworkParameters()
  master_params.name = master
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_params.name,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shutdown the node daemon and cleanup"
                    " the node: %s", msg)

def SetupNodeDaemon(opts, cluster_name, node):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node

  """
  data = {
    constants.NDS_CLUSTER_NAME: cluster_name,
    constants.NDS_NODE_DAEMON_CERTIFICATE:
      utils.ReadFile(pathutils.NODED_CERT_FILE),
    constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
    constants.NDS_START_NODE_DAEMON: True,
    }

  RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                  opts.debug, opts.verbose,
                  True, opts.ssh_key_check, opts.ssh_key_check, data)

  _WaitForNodeDaemon(node)

def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # instantiate a real config writer, as we now know we have the
    # configuration data
    cfg = config.ConfigWriter(accept_foreign=True)

    cluster_info = cfg.GetClusterInfo()
    cluster_info.master_node = new_master
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)
  except errors.ConfigurationError, err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1

  # if cfg.Update worked, then it means the old master daemon won't be
  # able now to write its own config file (we rely on locking in both
  # backend.UploadFile() and ConfigWriter._Write(); hence the next
  # step is to kill the old master

  logging.info("Stopping the master daemon on node %s", old_master)

  runner = rpc.BootstrapRunner()
  master_params = cfg.GetMasterNetworkParameters()
  master_params.name = old_master
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_params.name,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(old_master)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip():
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
      raise utils.RetryAgain()

  try:
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
  except utils.RetryTimeout:
    logging.warning("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master on the current"
                    " node will probably fail", total_timeout)

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)
  return rcode

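# --- Editor's illustration (not part of the original module) ----------------
# MasterFailover is the library half of the command-line failover (in Ganeti
# this is driven by "gnt-cluster master-failover"); it must run on the node
# that should become the new master and returns a shell-style exit code.  A
# hedged sketch of a direct call; _ExampleFailover is a hypothetical name.
def _ExampleFailover():
  rcode = MasterFailover(no_voting=False)
  if rcode != 0:
    logging.error("Master failover returned exit code %s", rcode)
  return rcode
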
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing directly ssconf, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master

def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  we use the same source of configuration information for both
  backend and bootstrap, so we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    # for now we accept both length 3, 4 and 5 (data[3] is primary ip version
    # and data[4] is the master netmask)
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = [v for v in votes.items()]
  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list

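# --- Editor's illustration (not part of the original module) ----------------
# The final sort in GatherMasterVotes orders by vote count and then by node
# name, both descending; under Python 2 comparison rules None sorts below any
# string, so on a tie a real node name wins the top slot over the None bucket
# of unreachable nodes.  A toy, self-contained sketch of that ordering;
# _ExampleVoteOrdering is a hypothetical name.
def _ExampleVoteOrdering():
  votes = {"node1": 2, None: 2, "node3": 1}
  vote_list = votes.items()
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
  # vote_list == [("node1", 2), (None, 2), ("node3", 1)]
  return vote_list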