31 """Logical units dealing with instance migration an failover."""

import logging
import time

from ganeti import constants
from ganeti import errors
from ganeti import locking
from ganeti.masterd import iallocator
from ganeti import utils
from ganeti.cmdlib.base import LogicalUnit, Tasklet
from ganeti.cmdlib.common import ExpandInstanceUuidAndName, \
  CheckIAllocatorOrNode, ExpandNodeUuidAndName
from ganeti.cmdlib.instance_storage import CheckDiskConsistency, \
  ExpandCheckDisks, ShutdownInstanceDisks, AssembleInstanceDisks
from ganeti.cmdlib.instance_utils import BuildInstanceHookEnvByObject, \
  CheckTargetNodeIPolicy, ReleaseLocks, CheckNodeNotDrained, \
  CopyLockList, CheckNodeFreeMemory, CheckInstanceBridgesExist

import ganeti.masterd.instance


111 """Failover an instance.
112
113 """
114 HPATH = "instance-failover"
115 HTYPE = constants.HTYPE_INSTANCE
116 REQ_BGL = False

  def CheckArguments(self):
    """Check the arguments.

    """
    self.iallocator = getattr(self.op, "iallocator", None)
    self.target_node = getattr(self.op, "target_node", None)

  def ExpandNames(self):
    self._ExpandAndLockInstance()
    _ExpandNamesForMigration(self)

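    # The positional flags below follow TLMigrateInstance.__init__:
    # cleanup, failover=True, fallback=False, ignore_consistency,
    # allow_runtime_changes=True, shutdown_timeout, ignore_ipolicy.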
    self._migrater = \
      TLMigrateInstance(self, self.op.instance_uuid, self.op.instance_name,
                        self.op.cleanup, True, False,
                        self.op.ignore_consistency, True,
                        self.op.shutdown_timeout, self.op.ignore_ipolicy)

    self.tasklets = [self._migrater]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node_uuid = instance.primary_node
    target_node_uuid = self._migrater.target_node_uuid
    env = {
      "IGNORE_CONSISTENCY": self.op.ignore_consistency,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      "OLD_PRIMARY": self.cfg.GetNodeName(source_node_uuid),
      "NEW_PRIMARY": self.cfg.GetNodeName(target_node_uuid),
      "FAILOVER_CLEANUP": self.op.cleanup,
      }

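    # With internally mirrored disks (DRBD), the old primary becomes the new
    # secondary after the failover; other disk templates have no secondary.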
    if instance.disk_template in constants.DTS_INT_MIRROR:
      env["OLD_SECONDARY"] = self.cfg.GetNodeName(instance.secondary_nodes[0])
      env["NEW_SECONDARY"] = self.cfg.GetNodeName(source_node_uuid)
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    env.update(BuildInstanceHookEnvByObject(self, instance))

    return env


class LUInstanceMigrate(LogicalUnit):
  """Migrate an instance.

  This is migration without shutting down the instance; a failover, by
  contrast, is done with a shutdown.

  """
  HPATH = "instance-migrate"
  HTYPE = constants.HTYPE_INSTANCE
  REQ_BGL = False
205 """Build hooks env.
206
207 This runs on master, primary and secondary nodes of the instance.
208
209 """
210 instance = self._migrater.instance
211 source_node_uuid = instance.primary_node
212 target_node_uuid = self._migrater.target_node_uuid
213 env = BuildInstanceHookEnvByObject(self, instance)
214 env.update({
215 "MIGRATE_LIVE": self._migrater.live,
216 "MIGRATE_CLEANUP": self.op.cleanup,
217 "OLD_PRIMARY": self.cfg.GetNodeName(source_node_uuid),
218 "NEW_PRIMARY": self.cfg.GetNodeName(target_node_uuid),
219 "ALLOW_RUNTIME_CHANGES": self.op.allow_runtime_changes,
220 })

    if instance.disk_template in constants.DTS_INT_MIRROR:
      env["OLD_SECONDARY"] = self.cfg.GetNodeName(instance.secondary_nodes[0])
      env["NEW_SECONDARY"] = self.cfg.GetNodeName(source_node_uuid)
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    return env


242 """Tasklet class for instance migration.
243
244 @type live: boolean
245 @ivar live: whether the migration will be done live or non-live;
246 this variable is initalized only after CheckPrereq has run
247 @type cleanup: boolean
248 @ivar cleanup: Wheater we cleanup from a failed migration
249 @type iallocator: string
250 @ivar iallocator: The iallocator used to determine target_node
251 @type target_node_uuid: string
252 @ivar target_node_uuid: If given, the target node UUID to reallocate the
253 instance to
254 @type failover: boolean
255 @ivar failover: Whether operation results in failover or migration
256 @type fallback: boolean
257 @ivar fallback: Whether fallback to failover is allowed if migration not
258 possible
259 @type ignore_consistency: boolean
260 @ivar ignore_consistency: Wheter we should ignore consistency between source
261 and target node
262 @type shutdown_timeout: int
263 @ivar shutdown_timeout: In case of failover timeout of the shutdown
264 @type ignore_ipolicy: bool
265 @ivar ignore_ipolicy: If true, we can ignore instance policy when migrating
266
267 """
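  # Poll and feedback intervals for the migration status loop, in seconds.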
  _MIGRATION_POLL_INTERVAL = 1
  _MIGRATION_FEEDBACK_INTERVAL = 10

  def __init__(self, lu, instance_uuid, instance_name, cleanup, failover,
               fallback, ignore_consistency, allow_runtime_changes,
               shutdown_timeout, ignore_ipolicy):
    """Initializes this class.

    """
    Tasklet.__init__(self, lu)

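    # Parameters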
    self.instance_uuid = instance_uuid
    self.instance_name = instance_name
    self.cleanup = cleanup
    self.live = False
    self.failover = failover
    self.fallback = fallback
    self.ignore_consistency = ignore_consistency
    self.shutdown_timeout = shutdown_timeout
    self.ignore_ipolicy = ignore_ipolicy
    self.allow_runtime_changes = allow_runtime_changes

294 """Check prerequisites.
295
296 This checks that the instance is in the cluster.
297
298 """
299 (self.instance_uuid, self.instance_name) = \
300 ExpandInstanceUuidAndName(self.lu.cfg, self.instance_uuid,
301 self.instance_name)
302 self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)
303 assert self.instance is not None
304 cluster = self.cfg.GetClusterInfo()
305
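    # A plain migration of an instance that is not running makes no sense;
    # if fallback is allowed, degrade the request to a failover.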
    if (not self.cleanup and
        not self.instance.admin_state == constants.ADMINST_UP and
        not self.failover and self.fallback):
      self.lu.LogInfo("Instance is marked down or offline, fallback allowed,"
                      " switching to failover")
      self.failover = True

    if self.instance.disk_template not in constants.DTS_MIRRORED:
      if self.failover:
        text = "failovers"
      else:
        text = "migrations"
      raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
                                 " %s" % (self.instance.disk_template, text),
                                 errors.ECODE_STATE)

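    # Externally mirrored disks (shared storage) can move to any node, so the
    # destination has to come from an iallocator or an explicit target node.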
    if self.instance.disk_template in constants.DTS_EXT_MIRROR:
      CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")

      if self.lu.op.iallocator:
        assert locking.NAL in self.lu.owned_locks(locking.LEVEL_NODE_ALLOC)
        self._RunAllocator()
      else:
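        # No iallocator was given, so the target node comes directly from
        # the opcode.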
        self.target_node_uuid = self.lu.op.target_node_uuid

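      # Verify that the chosen target node satisfies the instance policy of
      # its node group.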
      nodeinfo = self.cfg.GetNodeInfo(self.target_node_uuid)
      group_info = self.cfg.GetNodeGroup(nodeinfo.group)
      ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                              group_info)
      CheckTargetNodeIPolicy(self.lu, ipolicy, self.instance, nodeinfo,
                             self.cfg, ignore=self.ignore_ipolicy)

      target_node_uuid = self.target_node_uuid
      if self.target_node_uuid == self.instance.primary_node:
        raise errors.OpPrereqError(
          "Cannot migrate instance %s to its primary (%s)" %
          (self.instance.name,
           self.cfg.GetNodeName(self.instance.primary_node)),
          errors.ECODE_STATE)

      if len(self.lu.tasklets) == 1:
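        # This LU is the only tasklet, so it is safe to narrow the node locks
        # down to the source and target node only.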
        ReleaseLocks(self.lu, locking.LEVEL_NODE,
                     keep=[self.instance.primary_node, self.target_node_uuid])
        ReleaseLocks(self.lu, locking.LEVEL_NODE_ALLOC)

    else:
      assert not self.lu.glm.is_owned(locking.LEVEL_NODE_ALLOC)

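      # Internally mirrored disks (DRBD) can only be moved to the current
      # secondary node; no other target may be requested.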
      secondary_node_uuids = self.instance.secondary_nodes
      if not secondary_node_uuids:
        raise errors.ConfigurationError("No secondary node but using"
                                        " %s disk template" %
                                        self.instance.disk_template)
      self.target_node_uuid = target_node_uuid = secondary_node_uuids[0]
      if self.lu.op.iallocator or \
         (self.lu.op.target_node_uuid and
          self.lu.op.target_node_uuid != target_node_uuid):
        if self.failover:
          text = "failed over"
        else:
          text = "migrated"
        raise errors.OpPrereqError("Instances with disk template %s cannot"
                                   " be %s to arbitrary nodes"
                                   " (neither an iallocator nor a target"
                                   " node can be passed)" %
                                   (self.instance.disk_template, text),
                                   errors.ECODE_INVAL)
      nodeinfo = self.cfg.GetNodeInfo(target_node_uuid)
      group_info = self.cfg.GetNodeGroup(nodeinfo.group)
      ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                              group_info)
      CheckTargetNodeIPolicy(self.lu, ipolicy, self.instance, nodeinfo,
                             self.cfg, ignore=self.ignore_ipolicy)

    i_be = cluster.FillBE(self.instance)

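    # Check free memory on the target node, unless we are only cleaning up or
    # failing over an instance that is down (it will not be started there).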
    if (not self.cleanup and
        (not self.failover or
         self.instance.admin_state == constants.ADMINST_UP)):
      self.tgt_free_mem = CheckNodeFreeMemory(
          self.lu, target_node_uuid,
          "migrating instance %s" % self.instance.name,
          i_be[constants.BE_MINMEM], self.instance.hypervisor,
          self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])
    else:
      self.lu.LogInfo("Not checking memory on the secondary node as"
                      " instance will not be started")

    if (not self.cleanup and not self.failover and
        i_be[constants.BE_ALWAYS_FAILOVER]):
      self.lu.LogInfo("Instance configured to always failover; fallback"
                      " to failover")
      self.failover = True

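    # Check that the network bridges required by the instance exist on the
    # target node.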
    CheckInstanceBridgesExist(self.lu, self.instance,
                              node_uuid=target_node_uuid)

    if not self.cleanup:
      CheckNodeNotDrained(self.lu, target_node_uuid)
      if not self.failover:
        result = self.rpc.call_instance_migratable(self.instance.primary_node,
                                                   self.instance)
        if result.fail_msg and self.fallback:
          self.lu.LogInfo("Can't migrate, instance offline, fallback to"
                          " failover")
          self.failover = True
        else:
          result.Raise("Can't migrate, please use failover",
                       prereq=True, ecode=errors.ECODE_STATE)

    assert not (self.failover and self.cleanup)

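    # Determine whether the migration should be live: an explicit 'live' flag
    # is translated into 'mode', otherwise the hypervisor's default migration
    # mode is used.  A failover is never live.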
    if not self.failover:
      if self.lu.op.live is not None and self.lu.op.mode is not None:
        raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
                                   " parameters is accepted",
                                   errors.ECODE_INVAL)
      if self.lu.op.live is not None:
        if self.lu.op.live:
          self.lu.op.mode = constants.HT_MIGRATION_LIVE
        else:
          self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
        self.lu.op.live = None
      elif self.lu.op.mode is None:
        i_hv = cluster.FillHV(self.instance, skip_globals=False)
        self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]

      self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
    else:
      self.live = False

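    # Remember how much memory the instance currently uses, so that
    # _ExecMigration can balloon it down if the target node is short on
    # memory.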
    if not (self.failover or self.cleanup):
      remote_info = self.rpc.call_instance_info(
          self.instance.primary_node, self.instance.name,
          self.instance.hypervisor, cluster.hvparams[self.instance.hypervisor])
      remote_info.Raise("Error checking instance on node %s" %
                        self.cfg.GetNodeName(self.instance.primary_node),
                        prereq=True)
      instance_running = bool(remote_info.payload)
      if instance_running:
        self.current_mem = int(remote_info.payload["memory"])
487 """Poll with custom rpc for disk sync.
488
489 This uses our own step-based rpc call.
490
491 """
492 self.feedback_fn("* wait until resync is done")
493 all_done = False
494 while not all_done:
495 all_done = True
496 result = self.rpc.call_drbd_wait_sync(self.all_node_uuids,
497 (self.instance.disks,
498 self.instance))
499 min_percent = 100
500 for node_uuid, nres in result.items():
501 nres.Raise("Cannot resync disks on node %s" %
502 self.cfg.GetNodeName(node_uuid))
503 node_done, node_percent = nres.payload
504 all_done = all_done and node_done
505 if node_percent is not None:
506 min_percent = min(min_percent, node_percent)
507 if not all_done:
508 if min_percent < 100:
509 self.feedback_fn(" - progress: %.1f%%" % min_percent)
510 time.sleep(2)
525 """Disconnect from the network.
526
527 """
528 self.feedback_fn("* changing into standalone mode")
529 result = self.rpc.call_drbd_disconnect_net(
530 self.all_node_uuids, (self.instance.disks, self.instance))
531 for node_uuid, nres in result.items():
532 nres.Raise("Cannot disconnect disks node %s" %
533 self.cfg.GetNodeName(node_uuid))

  def _GoReconnect(self, multimaster):
    """Reconnect to the network.

    """
    if multimaster:
      msg = "dual-master"
    else:
      msg = "single-master"
    self.feedback_fn("* changing disks into %s mode" % msg)
    result = self.rpc.call_drbd_attach_net(self.all_node_uuids,
                                           (self.instance.disks, self.instance),
                                           self.instance.name, multimaster)
    for node_uuid, nres in result.items():
      nres.Raise("Cannot change disks config on node %s" %
                 self.cfg.GetNodeName(node_uuid))

  def _ExecCleanup(self):
    """Try to cleanup after a failed migration.

    The cleanup is done by:
      - check that the instance is running only on one node
        (and update the config if needed)
      - change disks on its secondary node to secondary
      - wait until disks are fully synchronized
      - disconnect from the network
      - change disks into single-master mode
      - wait again until disks are fully synchronized

    """
    self.feedback_fn("* checking where the instance actually runs"
                     " (if this hangs, the hypervisor might be in"
                     " a bad state)")
    cluster_hvparams = self.cfg.GetClusterInfo().hvparams
    ins_l = self.rpc.call_instance_list(self.all_node_uuids,
                                        [self.instance.hypervisor],
                                        cluster_hvparams)
    for node_uuid, result in ins_l.items():
      result.Raise("Can't contact node %s" % node_uuid)

    runningon_source = self.instance.name in \
                         ins_l[self.source_node_uuid].payload
    runningon_target = self.instance.name in \
                         ins_l[self.target_node_uuid].payload

    if runningon_source and runningon_target:
      raise errors.OpExecError("Instance seems to be running on two nodes,"
                               " or the hypervisor is confused; you will have"
                               " to ensure manually that it runs only on one"
                               " and restart this operation")

    if not (runningon_source or runningon_target):
      raise errors.OpExecError("Instance does not seem to be running at all;"
                               " in this case it's safer to repair by"
                               " running 'gnt-instance stop' to ensure disk"
                               " shutdown, and then restarting it")

    if runningon_target:
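      # The migration actually succeeded at the hypervisor level; record the
      # new primary node in the configuration.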
      self.feedback_fn("* instance running on secondary node (%s),"
                       " updating config" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      self.instance.primary_node = self.target_node_uuid
      self.cfg.Update(self.instance, self.feedback_fn)
      demoted_node_uuid = self.source_node_uuid
    else:
      self.feedback_fn("* instance confirmed to be running on its"
                       " primary node (%s)" %
                       self.cfg.GetNodeName(self.source_node_uuid))
      demoted_node_uuid = self.target_node_uuid

    if self.instance.disk_template in constants.DTS_INT_MIRROR:
      self._EnsureSecondary(demoted_node_uuid)
      try:
        self._WaitUntilSync()
      except errors.OpExecError:
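        # Errors are deliberately ignored here: the disks are reconfigured
        # and re-synchronized right below anyway.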
        pass
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

    self.feedback_fn("* done")


  def _AbortMigration(self):
    """Call the hypervisor code to abort a started migration.

    """
    abort_result = self.rpc.call_instance_finalize_migration_dst(
                     self.target_node_uuid, self.instance, self.migration_info,
                     False)
    abort_msg = abort_result.fail_msg
    if abort_msg:
      logging.error("Aborting migration failed on target node %s: %s",
                    self.cfg.GetNodeName(self.target_node_uuid), abort_msg)

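    # Failures are only logged, not raised: the caller still has to revert
    # the disk status even if aborting on one of the nodes failed.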
    abort_result = self.rpc.call_instance_finalize_migration_src(
                     self.source_node_uuid, self.instance, False, self.live)
    abort_msg = abort_result.fail_msg
    if abort_msg:
      logging.error("Aborting migration failed on source node %s: %s",
                    self.cfg.GetNodeName(self.source_node_uuid), abort_msg)

  def _ExecMigration(self):
    """Migrate an instance.

    The migration is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

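    # Warn (but do not abort) if the hypervisor versions on the two nodes
    # differ.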
    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.instance.disks):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

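    # If the instance currently uses more memory than the target node has
    # free, try to balloon it down to the available amount first.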
    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

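    # First, fetch the hypervisor-specific migration information from the
    # source node; it is later handed to the target node.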
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
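      # Switch the disks into dual-master mode so that both nodes can write
      # to them during the live migration.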
      self._EnsureSecondary(self.target_node_uuid)
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
               self.source_node_uuid, cluster.cluster_name, self.instance,
               self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

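    # Poll the migration status on the source node until the hypervisor
    # reports the memory transfer as finished (or failed), giving periodic
    # progress feedback.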
    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if result.payload.status != constants.HV_MIGRATION_ACTIVE:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    result = self.rpc.call_instance_finalize_migration_src(
               self.source_node_uuid, self.instance, True, self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    self.instance.primary_node = self.target_node_uuid

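    # Distribute the updated instance configuration to all nodes.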
    self.cfg.Update(self.instance, self.feedback_fn)

    result = self.rpc.call_instance_finalize_migration_dst(
               self.target_node_uuid, self.instance, migration_info, True)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      self._EnsureSecondary(self.source_node_uuid)
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

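    # For externally mirrored templates (RBD/Ext), unmap the disks from the
    # source node after the successful migration.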
    if self.instance.disk_template in (constants.DT_RBD, constants.DT_EXT):
      disks = ExpandCheckDisks(self.instance, self.instance.disks)
      self.feedback_fn("* unmapping instance's disks from %s" %
                       self.cfg.GetNodeName(self.source_node_uuid))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")

847 """Failover an instance.
848
849 The failover is done by shutting it down on its present node and
850 starting it on the secondary.
851
852 """
    primary_node = self.cfg.GetNodeInfo(self.instance.primary_node)

    source_node_uuid = self.instance.primary_node

    if self.instance.disks_active:
      self.feedback_fn("* checking disk consistency between source and target")
      for (idx, dev) in enumerate(self.instance.disks):
        if not CheckDiskConsistency(self.lu, self.instance, dev,
                                    self.target_node_uuid, False):
          if primary_node.offline:
            self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
                             " target node %s" %
                             (primary_node.name, idx,
                              self.cfg.GetNodeName(self.target_node_uuid)))
          elif not self.ignore_consistency:
            raise errors.OpExecError("Disk %s is degraded on target node,"
                                     " aborting failover" % idx)
    else:
      self.feedback_fn("* not checking disk consistency as instance is not"
                       " running")

    self.feedback_fn("* shutting down instance on source node")
    logging.info("Shutting down instance %s on node %s",
                 self.instance.name, self.cfg.GetNodeName(source_node_uuid))

    result = self.rpc.call_instance_shutdown(source_node_uuid, self.instance,
                                             self.shutdown_timeout,
                                             self.lu.op.reason)
    msg = result.fail_msg
    if msg:
      if self.ignore_consistency or primary_node.offline:
        self.lu.LogWarning("Could not shutdown instance %s on node %s,"
                           " proceeding anyway; please make sure node"
                           " %s is down; error details: %s",
                           self.instance.name,
                           self.cfg.GetNodeName(source_node_uuid),
                           self.cfg.GetNodeName(source_node_uuid), msg)
      else:
        raise errors.OpExecError("Could not shutdown instance %s on"
                                 " node %s: %s" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(source_node_uuid), msg))

    self.feedback_fn("* deactivating the instance's disks on source node")
    if not ShutdownInstanceDisks(self.lu, self.instance, ignore_primary=True):
      raise errors.OpExecError("Can't shut down the instance's disks")

    self.instance.primary_node = self.target_node_uuid
    self.cfg.Update(self.instance, self.feedback_fn)

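    # Only start the instance on the new primary node if it was marked as
    # running.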
    if self.instance.admin_state == constants.ADMINST_UP:
      self.feedback_fn("* activating the instance's disks on target node %s" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      logging.info("Starting instance %s on node %s", self.instance.name,
                   self.cfg.GetNodeName(self.target_node_uuid))

      disks_ok, _ = AssembleInstanceDisks(self.lu, self.instance,
                                          ignore_secondaries=True)
      if not disks_ok:
        ShutdownInstanceDisks(self.lu, self.instance)
        raise errors.OpExecError("Can't activate the instance's disks")

      self.feedback_fn("* starting the instance on the target node %s" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      result = self.rpc.call_instance_start(self.target_node_uuid,
                                            (self.instance, None, None), False,
                                            self.lu.op.reason)
      msg = result.fail_msg
      if msg:
        ShutdownInstanceDisks(self.lu, self.instance)
        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  msg))

  def Exec(self, feedback_fn):
    """Perform the migration.

    """
    self.feedback_fn = feedback_fn
    self.source_node_uuid = self.instance.primary_node

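    # With internally mirrored disks (DRBD) the only possible target is the
    # current secondary node; otherwise CheckPrereq has already chosen
    # self.target_node_uuid.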
    if self.instance.disk_template in constants.DTS_INT_MIRROR:
      self.target_node_uuid = self.instance.secondary_nodes[0]

    self.all_node_uuids = [self.source_node_uuid, self.target_node_uuid]
    self.nodes_ip = dict((uuid, node.secondary_ip) for (uuid, node)
                         in self.cfg.GetMultiNodeInfo(self.all_node_uuids))

    if self.failover:
      feedback_fn("Failover instance %s" % self.instance.name)
      self._ExecFailover()
    else:
      feedback_fn("Migrating instance %s" % self.instance.name)

      if self.cleanup:
        return self._ExecCleanup()
      else:
        return self._ExecMigration()