
Source Code for Module ganeti.jqueue

   1  # 
   2  # 
   3   
   4  # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 Google Inc. 
   5  # 
   6  # This program is free software; you can redistribute it and/or modify 
   7  # it under the terms of the GNU General Public License as published by 
   8  # the Free Software Foundation; either version 2 of the License, or 
   9  # (at your option) any later version. 
  10  # 
  11  # This program is distributed in the hope that it will be useful, but 
  12  # WITHOUT ANY WARRANTY; without even the implied warranty of 
  13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  14  # General Public License for more details. 
  15  # 
  16  # You should have received a copy of the GNU General Public License 
  17  # along with this program; if not, write to the Free Software 
  18  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
  19  # 02110-1301, USA. 
  20   
  21   
  22  """Module implementing the job queue handling. 
  23   
  24  Locking: there's a single, large lock in the L{JobQueue} class. It's 
  25  used by all other classes in this module. 
  26   
  27  @var JOBQUEUE_THREADS: the number of worker threads we start for 
  28      processing jobs 
  29   
  30  """ 
  31   
  32  import logging 
  33  import errno 
  34  import time 
  35  import weakref 
  36  import threading 
  37  import itertools 
  38   
  39  try: 
  40    # pylint: disable=E0611 
  41    from pyinotify import pyinotify 
  42  except ImportError: 
  43    import pyinotify 
  44   
  45  from ganeti import asyncnotifier 
  46  from ganeti import constants 
  47  from ganeti import serializer 
  48  from ganeti import workerpool 
  49  from ganeti import locking 
  50  from ganeti import opcodes 
  51  from ganeti import errors 
  52  from ganeti import mcpu 
  53  from ganeti import utils 
  54  from ganeti import jstore 
  55  from ganeti import rpc 
  56  from ganeti import runtime 
  57  from ganeti import netutils 
  58  from ganeti import compat 
  59  from ganeti import ht 
  60  from ganeti import query 
  61  from ganeti import qlang 
  62   
  63   
  64  JOBQUEUE_THREADS = 25 
  65  JOBS_PER_ARCHIVE_DIRECTORY = 10000 
  66   
  67  # member lock names to be passed to @ssynchronized decorator 
  68  _LOCK = "_lock" 
  69  _QUEUE = "_queue" 
70 71 72 -class CancelJob(Exception):
73 """Special exception to cancel a job. 74 75 """
76
77 78 -class QueueShutdown(Exception):
79 """Special exception to abort a job when the job queue is shutting down. 80 81 """
82
83 84 -def TimeStampNow():
85 """Returns the current timestamp. 86 87 @rtype: tuple 88 @return: the current time in the (seconds, microseconds) format 89 90 """ 91 return utils.SplitTime(time.time())
92
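# --- Editorial example (not part of ganeti.jqueue) --------------------------
# TimeStampNow() returns the current time as an integer pair rather than a
# float; a minimal sketch of the format, assuming the standard behaviour of
# utils.SplitTime:
#
#   >>> utils.SplitTime(1333200000.25)
#   (1333200000, 250000)
#
# Storing (seconds, microseconds) as integers keeps full precision when the
# timestamps are serialized to JSON in the on-disk job files.
# -----------------------------------------------------------------------------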
93 94 -class _SimpleJobQuery:
95 """Wrapper for job queries. 96 97 Instance keeps list of fields cached, useful e.g. in L{_JobChangesChecker}. 98 99 """
100 - def __init__(self, fields):
101 """Initializes this class. 102 103 """ 104 self._query = query.Query(query.JOB_FIELDS, fields)
105
106 - def __call__(self, job):
107 """Executes a job query using cached field list. 108 109 """ 110 return self._query.OldStyleQuery([(job.id, job)], sort_by_name=False)[0]
111
112 113 -class _QueuedOpCode(object):
114 """Encapsulates an opcode object. 115 116 @ivar log: holds the execution log and consists of tuples 117 of the form C{(log_serial, timestamp, level, message)} 118 @ivar input: the OpCode we encapsulate 119 @ivar status: the current status 120 @ivar result: the result of the LU execution 121 @ivar start_timestamp: timestamp for the start of the execution 122 @ivar exec_timestamp: timestamp for the actual LU Exec() function invocation 123 @ivar stop_timestamp: timestamp for the end of the execution 124 125 """ 126 __slots__ = ["input", "status", "result", "log", "priority", 127 "start_timestamp", "exec_timestamp", "end_timestamp", 128 "__weakref__"] 129
130 - def __init__(self, op):
131 """Initializes instances of this class. 132 133 @type op: L{opcodes.OpCode} 134 @param op: the opcode we encapsulate 135 136 """ 137 self.input = op 138 self.status = constants.OP_STATUS_QUEUED 139 self.result = None 140 self.log = [] 141 self.start_timestamp = None 142 self.exec_timestamp = None 143 self.end_timestamp = None 144 145 # Get initial priority (it might change during the lifetime of this opcode) 146 self.priority = getattr(op, "priority", constants.OP_PRIO_DEFAULT)
147 148 @classmethod
149 - def Restore(cls, state):
150 """Restore the _QueuedOpCode from the serialized form. 151 152 @type state: dict 153 @param state: the serialized state 154 @rtype: _QueuedOpCode 155 @return: a new _QueuedOpCode instance 156 157 """ 158 obj = _QueuedOpCode.__new__(cls) 159 obj.input = opcodes.OpCode.LoadOpCode(state["input"]) 160 obj.status = state["status"] 161 obj.result = state["result"] 162 obj.log = state["log"] 163 obj.start_timestamp = state.get("start_timestamp", None) 164 obj.exec_timestamp = state.get("exec_timestamp", None) 165 obj.end_timestamp = state.get("end_timestamp", None) 166 obj.priority = state.get("priority", constants.OP_PRIO_DEFAULT) 167 return obj
168
169 - def Serialize(self):
170 """Serializes this _QueuedOpCode. 171 172 @rtype: dict 173 @return: the dictionary holding the serialized state 174 175 """ 176 return { 177 "input": self.input.__getstate__(), 178 "status": self.status, 179 "result": self.result, 180 "log": self.log, 181 "start_timestamp": self.start_timestamp, 182 "exec_timestamp": self.exec_timestamp, 183 "end_timestamp": self.end_timestamp, 184 "priority": self.priority, 185 }
186
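# --- Editorial sketch (not part of ganeti.jqueue) ----------------------------
# Serialize() and Restore() are meant to be inverses, so an opcode's state
# survives the JSON round-trip used for the on-disk job files. A minimal
# sketch, assuming opcodes.OpTestDelay is available as in stock Ganeti:
#
#   op = _QueuedOpCode(opcodes.OpTestDelay(duration=0))
#   state = serializer.LoadJson(serializer.DumpJson(op.Serialize()))
#   restored = _QueuedOpCode.Restore(state)
#   assert restored.status == constants.OP_STATUS_QUEUED
#   assert restored.priority == op.priority
# -----------------------------------------------------------------------------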
187 188 -class _QueuedJob(object):
189  """In-memory job representation. 190 191 This is what we use to track the user-submitted jobs. Locking must 192 be taken care of by users of this class. 193 194 @type queue: L{JobQueue} 195 @ivar queue: the parent queue 196 @ivar id: the job ID 197 @type ops: list 198 @ivar ops: the list of _QueuedOpCode that constitute the job 199 @type log_serial: int 200 @ivar log_serial: holds the index for the next log entry 201 @ivar received_timestamp: the timestamp for when the job was received 202 @ivar start_timestamp: the timestamp for start of execution 203 @ivar end_timestamp: the timestamp for end of execution 204 @ivar writable: Whether the job is allowed to be modified 205 206 """ 207 # pylint: disable=W0212 208 __slots__ = ["queue", "id", "ops", "log_serial", "ops_iter", "cur_opctx", 209 "received_timestamp", "start_timestamp", "end_timestamp", 210 "__weakref__", "processor_lock", "writable"] 211
212 - def __init__(self, queue, job_id, ops, writable):
213 """Constructor for the _QueuedJob. 214 215 @type queue: L{JobQueue} 216 @param queue: our parent queue 217 @type job_id: job_id 218 @param job_id: our job id 219 @type ops: list 220 @param ops: the list of opcodes we hold, which will be encapsulated 221 in _QueuedOpCodes 222 @type writable: bool 223 @param writable: Whether job can be modified 224 225 """ 226 if not ops: 227 raise errors.GenericError("A job needs at least one opcode") 228 229 self.queue = queue 230 self.id = job_id 231 self.ops = [_QueuedOpCode(op) for op in ops] 232 self.log_serial = 0 233 self.received_timestamp = TimeStampNow() 234 self.start_timestamp = None 235 self.end_timestamp = None 236 237 self._InitInMemory(self, writable)
238 239 @staticmethod
240 - def _InitInMemory(obj, writable):
241 """Initializes in-memory variables. 242 243 """ 244 obj.writable = writable 245 obj.ops_iter = None 246 obj.cur_opctx = None 247 248 # Read-only jobs are not processed and therefore don't need a lock 249 if writable: 250 obj.processor_lock = threading.Lock() 251 else: 252 obj.processor_lock = None
253
254 - def __repr__(self):
255 status = ["%s.%s" % (self.__class__.__module__, self.__class__.__name__), 256 "id=%s" % self.id, 257 "ops=%s" % ",".join([op.input.Summary() for op in self.ops])] 258 259 return "<%s at %#x>" % (" ".join(status), id(self))
260 261 @classmethod
262 - def Restore(cls, queue, state, writable):
263  """Restore a _QueuedJob from serialized state. 264 265 @type queue: L{JobQueue} 266 @param queue: to which queue the restored job belongs 267 @type state: dict 268 @param state: the serialized state 269 @type writable: bool 270 @param writable: Whether job can be modified 271 @rtype: _QueuedJob 272 @return: the restored _QueuedJob instance 273 274 """ 275 obj = _QueuedJob.__new__(cls) 276 obj.queue = queue 277 obj.id = state["id"] 278 obj.received_timestamp = state.get("received_timestamp", None) 279 obj.start_timestamp = state.get("start_timestamp", None) 280 obj.end_timestamp = state.get("end_timestamp", None) 281 282 obj.ops = [] 283 obj.log_serial = 0 284 for op_state in state["ops"]: 285 op = _QueuedOpCode.Restore(op_state) 286 for log_entry in op.log: 287 obj.log_serial = max(obj.log_serial, log_entry[0]) 288 obj.ops.append(op) 289 290 cls._InitInMemory(obj, writable) 291 292 return obj
293
294 - def Serialize(self):
295  """Serialize the _QueuedJob instance. 296 297 @rtype: dict 298 @return: the serialized state 299 300 """ 301 return { 302 "id": self.id, 303 "ops": [op.Serialize() for op in self.ops], 304 "start_timestamp": self.start_timestamp, 305 "end_timestamp": self.end_timestamp, 306 "received_timestamp": self.received_timestamp, 307 }
308
309 - def CalcStatus(self):
310  """Compute the status of this job. 311 312 This function iterates over all the _QueuedOpCodes in the job and 313 based on their status, computes the job status. 314 315 The algorithm is: 316 - if we find a cancelled opcode, or one that finished with an error, 317 the job status will be the same 318 - otherwise, the last opcode with a status of: 319 - waiting 320 - canceling 321 - running 322 323 will determine the job status 324 325 - otherwise, it means either all opcodes are queued, or all have 326 succeeded, and the job status will be the same 327 328 @return: the job status 329 330 """ 331 status = constants.JOB_STATUS_QUEUED 332 333 all_success = True 334 for op in self.ops: 335 if op.status == constants.OP_STATUS_SUCCESS: 336 continue 337 338 all_success = False 339 340 if op.status == constants.OP_STATUS_QUEUED: 341 pass 342 elif op.status == constants.OP_STATUS_WAITING: 343 status = constants.JOB_STATUS_WAITING 344 elif op.status == constants.OP_STATUS_RUNNING: 345 status = constants.JOB_STATUS_RUNNING 346 elif op.status == constants.OP_STATUS_CANCELING: 347 status = constants.JOB_STATUS_CANCELING 348 break 349 elif op.status == constants.OP_STATUS_ERROR: 350 status = constants.JOB_STATUS_ERROR 351 # The whole job fails if one opcode failed 352 break 353 elif op.status == constants.OP_STATUS_CANCELED: 354 status = constants.OP_STATUS_CANCELED 355 break 356 357 if all_success: 358 status = constants.JOB_STATUS_SUCCESS 359 360 return status
361
362 - def CalcPriority(self):
363 """Gets the current priority for this job. 364 365 Only unfinished opcodes are considered. When all are done, the default 366 priority is used. 367 368 @rtype: int 369 370 """ 371 priorities = [op.priority for op in self.ops 372 if op.status not in constants.OPS_FINALIZED] 373 374 if not priorities: 375 # All opcodes are done, assume default priority 376 return constants.OP_PRIO_DEFAULT 377 378 return min(priorities)
379
380 - def GetLogEntries(self, newer_than):
381 """Selectively returns the log entries. 382 383 @type newer_than: None or int 384 @param newer_than: if this is None, return all log entries, 385 otherwise return only the log entries with serial higher 386 than this value 387 @rtype: list 388 @return: the list of the log entries selected 389 390 """ 391 if newer_than is None: 392 serial = -1 393 else: 394 serial = newer_than 395 396 entries = [] 397 for op in self.ops: 398 entries.extend(filter(lambda entry: entry[0] > serial, op.log)) 399 400 return entries
401
402 - def GetInfo(self, fields):
403 """Returns information about a job. 404 405 @type fields: list 406 @param fields: names of fields to return 407 @rtype: list 408 @return: list with one element for each field 409 @raise errors.OpExecError: when an invalid field 410 has been passed 411 412 """ 413 return _SimpleJobQuery(fields)(self)
414
415 - def MarkUnfinishedOps(self, status, result):
416  """Mark unfinished opcodes with a given status and result. 417 418 This is a utility function for marking all running or waiting to 419 be run opcodes with a given status. Opcodes which are already 420 finalized are not changed. 421 422 @param status: a given opcode status 423 @param result: the opcode result 424 425 """ 426 not_marked = True 427 for op in self.ops: 428 if op.status in constants.OPS_FINALIZED: 429 assert not_marked, "Finalized opcodes found after non-finalized ones" 430 continue 431 op.status = status 432 op.result = result 433 not_marked = False
434
435 - def Finalize(self):
436 """Marks the job as finalized. 437 438 """ 439 self.end_timestamp = TimeStampNow()
440
441 - def Cancel(self):
442 """Marks job as canceled/-ing if possible. 443 444 @rtype: tuple; (bool, string) 445 @return: Boolean describing whether job was successfully canceled or marked 446 as canceling and a text message 447 448 """ 449 status = self.CalcStatus() 450 451 if status == constants.JOB_STATUS_QUEUED: 452 self.MarkUnfinishedOps(constants.OP_STATUS_CANCELED, 453 "Job canceled by request") 454 self.Finalize() 455 return (True, "Job %s canceled" % self.id) 456 457 elif status == constants.JOB_STATUS_WAITING: 458 # The worker will notice the new status and cancel the job 459 self.MarkUnfinishedOps(constants.OP_STATUS_CANCELING, None) 460 return (True, "Job %s will be canceled" % self.id) 461 462 else: 463 logging.debug("Job %s is no longer waiting in the queue", self.id) 464 return (False, "Job %s is no longer waiting in the queue" % self.id)
465
466 467 -class _OpExecCallbacks(mcpu.OpExecCbBase):
468 - def __init__(self, queue, job, op):
469 """Initializes this class. 470 471 @type queue: L{JobQueue} 472 @param queue: Job queue 473 @type job: L{_QueuedJob} 474 @param job: Job object 475 @type op: L{_QueuedOpCode} 476 @param op: OpCode 477 478 """ 479 assert queue, "Queue is missing" 480 assert job, "Job is missing" 481 assert op, "Opcode is missing" 482 483 self._queue = queue 484 self._job = job 485 self._op = op
486
487 - def _CheckCancel(self):
488 """Raises an exception to cancel the job if asked to. 489 490 """ 491 # Cancel here if we were asked to 492 if self._op.status == constants.OP_STATUS_CANCELING: 493 logging.debug("Canceling opcode") 494 raise CancelJob() 495 496 # See if queue is shutting down 497 if not self._queue.AcceptingJobsUnlocked(): 498 logging.debug("Queue is shutting down") 499 raise QueueShutdown()
500 501 @locking.ssynchronized(_QUEUE, shared=1)
502 - def NotifyStart(self):
503 """Mark the opcode as running, not lock-waiting. 504 505 This is called from the mcpu code as a notifier function, when the LU is 506 finally about to start the Exec() method. Of course, to have end-user 507 visible results, the opcode must be initially (before calling into 508 Processor.ExecOpCode) set to OP_STATUS_WAITING. 509 510 """ 511 assert self._op in self._job.ops 512 assert self._op.status in (constants.OP_STATUS_WAITING, 513 constants.OP_STATUS_CANCELING) 514 515 # Cancel here if we were asked to 516 self._CheckCancel() 517 518 logging.debug("Opcode is now running") 519 520 self._op.status = constants.OP_STATUS_RUNNING 521 self._op.exec_timestamp = TimeStampNow() 522 523 # And finally replicate the job status 524 self._queue.UpdateJobUnlocked(self._job)
525 526 @locking.ssynchronized(_QUEUE, shared=1)
527 - def _AppendFeedback(self, timestamp, log_type, log_msg):
528 """Internal feedback append function, with locks 529 530 """ 531 self._job.log_serial += 1 532 self._op.log.append((self._job.log_serial, timestamp, log_type, log_msg)) 533 self._queue.UpdateJobUnlocked(self._job, replicate=False)
534
535 - def Feedback(self, *args):
536 """Append a log entry. 537 538 """ 539 assert len(args) < 3 540 541 if len(args) == 1: 542 log_type = constants.ELOG_MESSAGE 543 log_msg = args[0] 544 else: 545 (log_type, log_msg) = args 546 547 # The time is split to make serialization easier and not lose 548 # precision. 549 timestamp = utils.SplitTime(time.time()) 550 self._AppendFeedback(timestamp, log_type, log_msg)
551
552 - def CheckCancel(self):
553 """Check whether job has been cancelled. 554 555 """ 556 assert self._op.status in (constants.OP_STATUS_WAITING, 557 constants.OP_STATUS_CANCELING) 558 559 # Cancel here if we were asked to 560 self._CheckCancel()
561
562 - def SubmitManyJobs(self, jobs):
563 """Submits jobs for processing. 564 565 See L{JobQueue.SubmitManyJobs}. 566 567 """ 568 # Locking is done in job queue 569 return self._queue.SubmitManyJobs(jobs)
570
571 572 -class _JobChangesChecker(object):
573 - def __init__(self, fields, prev_job_info, prev_log_serial):
574 """Initializes this class. 575 576 @type fields: list of strings 577 @param fields: Fields requested by LUXI client 578 @type prev_job_info: string 579 @param prev_job_info: previous job info, as passed by the LUXI client 580 @type prev_log_serial: string 581 @param prev_log_serial: previous job serial, as passed by the LUXI client 582 583 """ 584 self._squery = _SimpleJobQuery(fields) 585 self._prev_job_info = prev_job_info 586 self._prev_log_serial = prev_log_serial
587
588 - def __call__(self, job):
589 """Checks whether job has changed. 590 591 @type job: L{_QueuedJob} 592 @param job: Job object 593 594 """ 595 assert not job.writable, "Expected read-only job" 596 597 status = job.CalcStatus() 598 job_info = self._squery(job) 599 log_entries = job.GetLogEntries(self._prev_log_serial) 600 601 # Serializing and deserializing data can cause type changes (e.g. from 602 # tuple to list) or precision loss. We're doing it here so that we get 603 # the same modifications as the data received from the client. Without 604 # this, the comparison afterwards might fail without the data being 605 # significantly different. 606 # TODO: we just deserialized from disk, investigate how to make sure that 607 # the job info and log entries are compatible to avoid this further step. 608 # TODO: Doing something like in testutils.py:UnifyValueType might be more 609 # efficient, though floats will be tricky 610 job_info = serializer.LoadJson(serializer.DumpJson(job_info)) 611 log_entries = serializer.LoadJson(serializer.DumpJson(log_entries)) 612 613 # Don't even try to wait if the job is no longer running, there will be 614 # no changes. 615 if (status not in (constants.JOB_STATUS_QUEUED, 616 constants.JOB_STATUS_RUNNING, 617 constants.JOB_STATUS_WAITING) or 618 job_info != self._prev_job_info or 619 (log_entries and self._prev_log_serial != log_entries[0][0])): 620 logging.debug("Job %s changed", job.id) 621 return (job_info, log_entries) 622 623 return None
624
625 626 -class _JobFileChangesWaiter(object):
627 - def __init__(self, filename):
628 """Initializes this class. 629 630 @type filename: string 631 @param filename: Path to job file 632 @raises errors.InotifyError: if the notifier cannot be setup 633 634 """ 635 self._wm = pyinotify.WatchManager() 636 self._inotify_handler = \ 637 asyncnotifier.SingleFileEventHandler(self._wm, self._OnInotify, filename) 638 self._notifier = \ 639 pyinotify.Notifier(self._wm, default_proc_fun=self._inotify_handler) 640 try: 641 self._inotify_handler.enable() 642 except Exception: 643 # pyinotify doesn't close file descriptors automatically 644 self._notifier.stop() 645 raise
646
647 - def _OnInotify(self, notifier_enabled):
648 """Callback for inotify. 649 650 """ 651 if not notifier_enabled: 652 self._inotify_handler.enable()
653
654 - def Wait(self, timeout):
655 """Waits for the job file to change. 656 657 @type timeout: float 658 @param timeout: Timeout in seconds 659 @return: Whether there have been events 660 661 """ 662 assert timeout >= 0 663 have_events = self._notifier.check_events(timeout * 1000) 664 if have_events: 665 self._notifier.read_events() 666 self._notifier.process_events() 667 return have_events
668
669 - def Close(self):
670 """Closes underlying notifier and its file descriptor. 671 672 """ 673 self._notifier.stop()
674
675 676 -class _JobChangesWaiter(object):
677 - def __init__(self, filename):
678 """Initializes this class. 679 680 @type filename: string 681 @param filename: Path to job file 682 683 """ 684 self._filewaiter = None 685 self._filename = filename
686
687 - def Wait(self, timeout):
688 """Waits for a job to change. 689 690 @type timeout: float 691 @param timeout: Timeout in seconds 692 @return: Whether there have been events 693 694 """ 695 if self._filewaiter: 696 return self._filewaiter.Wait(timeout) 697 698 # Lazy setup: Avoid inotify setup cost when job file has already changed. 699 # If this point is reached, return immediately and let caller check the job 700 # file again in case there were changes since the last check. This avoids a 701 # race condition. 702 self._filewaiter = _JobFileChangesWaiter(self._filename) 703 704 return True
705
706 - def Close(self):
707 """Closes underlying waiter. 708 709 """ 710 if self._filewaiter: 711 self._filewaiter.Close()
712
713 714 -class _WaitForJobChangesHelper(object):
715 """Helper class using inotify to wait for changes in a job file. 716 717 This class takes a previous job status and serial, and alerts the client when 718 the current job status has changed. 719 720 """ 721 @staticmethod
722 - def _CheckForChanges(counter, job_load_fn, check_fn):
723 if counter.next() > 0: 724 # If this isn't the first check the job is given some more time to change 725 # again. This gives better performance for jobs generating many 726 # changes/messages. 727 time.sleep(0.1) 728 729 job = job_load_fn() 730 if not job: 731 raise errors.JobLost() 732 733 result = check_fn(job) 734 if result is None: 735 raise utils.RetryAgain() 736 737 return result
738
739 - def __call__(self, filename, job_load_fn, 740 fields, prev_job_info, prev_log_serial, timeout):
741 """Waits for changes on a job. 742 743 @type filename: string 744 @param filename: File on which to wait for changes 745 @type job_load_fn: callable 746 @param job_load_fn: Function to load job 747 @type fields: list of strings 748 @param fields: Which fields to check for changes 749 @type prev_job_info: list or None 750 @param prev_job_info: Last job information returned 751 @type prev_log_serial: int 752 @param prev_log_serial: Last job message serial number 753 @type timeout: float 754 @param timeout: maximum time to wait in seconds 755 756 """ 757 counter = itertools.count() 758 try: 759 check_fn = _JobChangesChecker(fields, prev_job_info, prev_log_serial) 760 waiter = _JobChangesWaiter(filename) 761 try: 762 return utils.Retry(compat.partial(self._CheckForChanges, 763 counter, job_load_fn, check_fn), 764 utils.RETRY_REMAINING_TIME, timeout, 765 wait_fn=waiter.Wait) 766 finally: 767 waiter.Close() 768 except (errors.InotifyError, errors.JobLost): 769 return None 770 except utils.RetryTimeout: 771 return constants.JOB_NOTCHANGED
772
773 774 -def _EncodeOpError(err):
775 """Encodes an error which occurred while processing an opcode. 776 777 """ 778 if isinstance(err, errors.GenericError): 779 to_encode = err 780 else: 781 to_encode = errors.OpExecError(str(err)) 782 783 return errors.EncodeException(to_encode)
784
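# --- Editorial example (not part of ganeti.jqueue) --------------------------
# Errors raised while executing an opcode are wrapped so they can be stored
# in the serialized job file: exceptions that are already Ganeti errors are
# encoded as-is, anything else is first converted to an OpExecError, e.g.
#
#   _EncodeOpError(ValueError("boom"))
#
# behaves like errors.EncodeException(errors.OpExecError("boom")).
# -----------------------------------------------------------------------------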
785 786 -class _TimeoutStrategyWrapper:
787 - def __init__(self, fn):
788 """Initializes this class. 789 790 """ 791 self._fn = fn 792 self._next = None
793
794 - def _Advance(self):
795 """Gets the next timeout if necessary. 796 797 """ 798 if self._next is None: 799 self._next = self._fn()
800
801 - def Peek(self):
802 """Returns the next timeout. 803 804 """ 805 self._Advance() 806 return self._next
807
808 - def Next(self):
809 """Returns the current timeout and advances the internal state. 810 811 """ 812 self._Advance() 813 result = self._next 814 self._next = None 815 return result
816
817 818 -class _OpExecContext:
819 - def __init__(self, op, index, log_prefix, timeout_strategy_factory):
820 """Initializes this class. 821 822 """ 823 self.op = op 824 self.index = index 825 self.log_prefix = log_prefix 826 self.summary = op.input.Summary() 827 828 # Create local copy to modify 829 if getattr(op.input, opcodes.DEPEND_ATTR, None): 830 self.jobdeps = op.input.depends[:] 831 else: 832 self.jobdeps = None 833 834 self._timeout_strategy_factory = timeout_strategy_factory 835 self._ResetTimeoutStrategy()
836
837 - def _ResetTimeoutStrategy(self):
838 """Creates a new timeout strategy. 839 840 """ 841 self._timeout_strategy = \ 842 _TimeoutStrategyWrapper(self._timeout_strategy_factory().NextAttempt)
843
844 - def CheckPriorityIncrease(self):
845 """Checks whether priority can and should be increased. 846 847 Called when locks couldn't be acquired. 848 849 """ 850 op = self.op 851 852 # Exhausted all retries and next round should not use blocking acquire 853 # for locks? 854 if (self._timeout_strategy.Peek() is None and 855 op.priority > constants.OP_PRIO_HIGHEST): 856 logging.debug("Increasing priority") 857 op.priority -= 1 858 self._ResetTimeoutStrategy() 859 return True 860 861 return False
862
863 - def GetNextLockTimeout(self):
864 """Returns the next lock acquire timeout. 865 866 """ 867 return self._timeout_strategy.Next()
868
869 870 -class _JobProcessor(object):
871 (DEFER, 872 WAITDEP, 873 FINISHED) = range(1, 4) 874
875 - def __init__(self, queue, opexec_fn, job, 876 _timeout_strategy_factory=mcpu.LockAttemptTimeoutStrategy):
877 """Initializes this class. 878 879 """ 880 self.queue = queue 881 self.opexec_fn = opexec_fn 882 self.job = job 883 self._timeout_strategy_factory = _timeout_strategy_factory
884 885 @staticmethod
886 - def _FindNextOpcode(job, timeout_strategy_factory):
887 """Locates the next opcode to run. 888 889 @type job: L{_QueuedJob} 890 @param job: Job object 891 @param timeout_strategy_factory: Callable to create new timeout strategy 892 893 """ 894 # Create some sort of a cache to speed up locating next opcode for future 895 # lookups 896 # TODO: Consider splitting _QueuedJob.ops into two separate lists, one for 897 # pending and one for processed ops. 898 if job.ops_iter is None: 899 job.ops_iter = enumerate(job.ops) 900 901 # Find next opcode to run 902 while True: 903 try: 904 (idx, op) = job.ops_iter.next() 905 except StopIteration: 906 raise errors.ProgrammerError("Called for a finished job") 907 908 if op.status == constants.OP_STATUS_RUNNING: 909 # Found an opcode already marked as running 910 raise errors.ProgrammerError("Called for job marked as running") 911 912 opctx = _OpExecContext(op, idx, "Op %s/%s" % (idx + 1, len(job.ops)), 913 timeout_strategy_factory) 914 915 if op.status not in constants.OPS_FINALIZED: 916 return opctx 917 918 # This is a job that was partially completed before master daemon 919 # shutdown, so it can be expected that some opcodes are already 920 # completed successfully (if any did error out, then the whole job 921 # should have been aborted and not resubmitted for processing). 922 logging.info("%s: opcode %s already processed, skipping", 923 opctx.log_prefix, opctx.summary)
924 925 @staticmethod
926 - def _MarkWaitlock(job, op):
927 """Marks an opcode as waiting for locks. 928 929 The job's start timestamp is also set if necessary. 930 931 @type job: L{_QueuedJob} 932 @param job: Job object 933 @type op: L{_QueuedOpCode} 934 @param op: Opcode object 935 936 """ 937 assert op in job.ops 938 assert op.status in (constants.OP_STATUS_QUEUED, 939 constants.OP_STATUS_WAITING) 940 941 update = False 942 943 op.result = None 944 945 if op.status == constants.OP_STATUS_QUEUED: 946 op.status = constants.OP_STATUS_WAITING 947 update = True 948 949 if op.start_timestamp is None: 950 op.start_timestamp = TimeStampNow() 951 update = True 952 953 if job.start_timestamp is None: 954 job.start_timestamp = op.start_timestamp 955 update = True 956 957 assert op.status == constants.OP_STATUS_WAITING 958 959 return update
960 961 @staticmethod
962 - def _CheckDependencies(queue, job, opctx):
963 """Checks if an opcode has dependencies and if so, processes them. 964 965 @type queue: L{JobQueue} 966 @param queue: Queue object 967 @type job: L{_QueuedJob} 968 @param job: Job object 969 @type opctx: L{_OpExecContext} 970 @param opctx: Opcode execution context 971 @rtype: bool 972 @return: Whether opcode will be re-scheduled by dependency tracker 973 974 """ 975 op = opctx.op 976 977 result = False 978 979 while opctx.jobdeps: 980 (dep_job_id, dep_status) = opctx.jobdeps[0] 981 982 (depresult, depmsg) = queue.depmgr.CheckAndRegister(job, dep_job_id, 983 dep_status) 984 assert ht.TNonEmptyString(depmsg), "No dependency message" 985 986 logging.info("%s: %s", opctx.log_prefix, depmsg) 987 988 if depresult == _JobDependencyManager.CONTINUE: 989 # Remove dependency and continue 990 opctx.jobdeps.pop(0) 991 992 elif depresult == _JobDependencyManager.WAIT: 993 # Need to wait for notification, dependency tracker will re-add job 994 # to workerpool 995 result = True 996 break 997 998 elif depresult == _JobDependencyManager.CANCEL: 999 # Job was cancelled, cancel this job as well 1000 job.Cancel() 1001 assert op.status == constants.OP_STATUS_CANCELING 1002 break 1003 1004 elif depresult in (_JobDependencyManager.WRONGSTATUS, 1005 _JobDependencyManager.ERROR): 1006 # Job failed or there was an error, this job must fail 1007 op.status = constants.OP_STATUS_ERROR 1008 op.result = _EncodeOpError(errors.OpExecError(depmsg)) 1009 break 1010 1011 else: 1012 raise errors.ProgrammerError("Unknown dependency result '%s'" % 1013 depresult) 1014 1015 return result
1016
1017 - def _ExecOpCodeUnlocked(self, opctx):
1018 """Processes one opcode and returns the result. 1019 1020 """ 1021 op = opctx.op 1022 1023 assert op.status == constants.OP_STATUS_WAITING 1024 1025 timeout = opctx.GetNextLockTimeout() 1026 1027 try: 1028 # Make sure not to hold queue lock while calling ExecOpCode 1029 result = self.opexec_fn(op.input, 1030 _OpExecCallbacks(self.queue, self.job, op), 1031 timeout=timeout, priority=op.priority) 1032 except mcpu.LockAcquireTimeout: 1033 assert timeout is not None, "Received timeout for blocking acquire" 1034 logging.debug("Couldn't acquire locks in %0.6fs", timeout) 1035 1036 assert op.status in (constants.OP_STATUS_WAITING, 1037 constants.OP_STATUS_CANCELING) 1038 1039 # Was job cancelled while we were waiting for the lock? 1040 if op.status == constants.OP_STATUS_CANCELING: 1041 return (constants.OP_STATUS_CANCELING, None) 1042 1043 # Queue is shutting down, return to queued 1044 if not self.queue.AcceptingJobsUnlocked(): 1045 return (constants.OP_STATUS_QUEUED, None) 1046 1047 # Stay in waitlock while trying to re-acquire lock 1048 return (constants.OP_STATUS_WAITING, None) 1049 except CancelJob: 1050 logging.exception("%s: Canceling job", opctx.log_prefix) 1051 assert op.status == constants.OP_STATUS_CANCELING 1052 return (constants.OP_STATUS_CANCELING, None) 1053 1054 except QueueShutdown: 1055 logging.exception("%s: Queue is shutting down", opctx.log_prefix) 1056 1057 assert op.status == constants.OP_STATUS_WAITING 1058 1059 # Job hadn't been started yet, so it should return to the queue 1060 return (constants.OP_STATUS_QUEUED, None) 1061 1062 except Exception, err: # pylint: disable=W0703 1063 logging.exception("%s: Caught exception in %s", 1064 opctx.log_prefix, opctx.summary) 1065 return (constants.OP_STATUS_ERROR, _EncodeOpError(err)) 1066 else: 1067 logging.debug("%s: %s successful", 1068 opctx.log_prefix, opctx.summary) 1069 return (constants.OP_STATUS_SUCCESS, result)
1070
1071 - def __call__(self, _nextop_fn=None):
1072 """Continues execution of a job. 1073 1074 @param _nextop_fn: Callback function for tests 1075 @return: C{FINISHED} if job is fully processed, C{DEFER} if the job should 1076 be deferred and C{WAITDEP} if the dependency manager 1077 (L{_JobDependencyManager}) will re-schedule the job when appropriate 1078 1079 """ 1080 queue = self.queue 1081 job = self.job 1082 1083 logging.debug("Processing job %s", job.id) 1084 1085 queue.acquire(shared=1) 1086 try: 1087 opcount = len(job.ops) 1088 1089 assert job.writable, "Expected writable job" 1090 1091 # Don't do anything for finalized jobs 1092 if job.CalcStatus() in constants.JOBS_FINALIZED: 1093 return self.FINISHED 1094 1095 # Is a previous opcode still pending? 1096 if job.cur_opctx: 1097 opctx = job.cur_opctx 1098 job.cur_opctx = None 1099 else: 1100 if __debug__ and _nextop_fn: 1101 _nextop_fn() 1102 opctx = self._FindNextOpcode(job, self._timeout_strategy_factory) 1103 1104 op = opctx.op 1105 1106 # Consistency check 1107 assert compat.all(i.status in (constants.OP_STATUS_QUEUED, 1108 constants.OP_STATUS_CANCELING) 1109 for i in job.ops[opctx.index + 1:]) 1110 1111 assert op.status in (constants.OP_STATUS_QUEUED, 1112 constants.OP_STATUS_WAITING, 1113 constants.OP_STATUS_CANCELING) 1114 1115 assert (op.priority <= constants.OP_PRIO_LOWEST and 1116 op.priority >= constants.OP_PRIO_HIGHEST) 1117 1118 waitjob = None 1119 1120 if op.status != constants.OP_STATUS_CANCELING: 1121 assert op.status in (constants.OP_STATUS_QUEUED, 1122 constants.OP_STATUS_WAITING) 1123 1124 # Prepare to start opcode 1125 if self._MarkWaitlock(job, op): 1126 # Write to disk 1127 queue.UpdateJobUnlocked(job) 1128 1129 assert op.status == constants.OP_STATUS_WAITING 1130 assert job.CalcStatus() == constants.JOB_STATUS_WAITING 1131 assert job.start_timestamp and op.start_timestamp 1132 assert waitjob is None 1133 1134 # Check if waiting for a job is necessary 1135 waitjob = self._CheckDependencies(queue, job, opctx) 1136 1137 assert op.status in (constants.OP_STATUS_WAITING, 1138 constants.OP_STATUS_CANCELING, 1139 constants.OP_STATUS_ERROR) 1140 1141 if not (waitjob or op.status in (constants.OP_STATUS_CANCELING, 1142 constants.OP_STATUS_ERROR)): 1143 logging.info("%s: opcode %s waiting for locks", 1144 opctx.log_prefix, opctx.summary) 1145 1146 assert not opctx.jobdeps, "Not all dependencies were removed" 1147 1148 queue.release() 1149 try: 1150 (op_status, op_result) = self._ExecOpCodeUnlocked(opctx) 1151 finally: 1152 queue.acquire(shared=1) 1153 1154 op.status = op_status 1155 op.result = op_result 1156 1157 assert not waitjob 1158 1159 if op.status in (constants.OP_STATUS_WAITING, 1160 constants.OP_STATUS_QUEUED): 1161 # waiting: Couldn't get locks in time 1162 # queued: Queue is shutting down 1163 assert not op.end_timestamp 1164 else: 1165 # Finalize opcode 1166 op.end_timestamp = TimeStampNow() 1167 1168 if op.status == constants.OP_STATUS_CANCELING: 1169 assert not compat.any(i.status != constants.OP_STATUS_CANCELING 1170 for i in job.ops[opctx.index:]) 1171 else: 1172 assert op.status in constants.OPS_FINALIZED 1173 1174 if op.status == constants.OP_STATUS_QUEUED: 1175 # Queue is shutting down 1176 assert not waitjob 1177 1178 finalize = False 1179 1180 # Reset context 1181 job.cur_opctx = None 1182 1183 # In no case must the status be finalized here 1184 assert job.CalcStatus() == constants.JOB_STATUS_QUEUED 1185 1186 elif op.status == constants.OP_STATUS_WAITING or waitjob: 1187 finalize = False 1188 1189 if not waitjob and 
opctx.CheckPriorityIncrease(): 1190 # Priority was changed, need to update on-disk file 1191 queue.UpdateJobUnlocked(job) 1192 1193 # Keep around for another round 1194 job.cur_opctx = opctx 1195 1196 assert (op.priority <= constants.OP_PRIO_LOWEST and 1197 op.priority >= constants.OP_PRIO_HIGHEST) 1198 1199 # In no case must the status be finalized here 1200 assert job.CalcStatus() == constants.JOB_STATUS_WAITING 1201 1202 else: 1203 # Ensure all opcodes so far have been successful 1204 assert (opctx.index == 0 or 1205 compat.all(i.status == constants.OP_STATUS_SUCCESS 1206 for i in job.ops[:opctx.index])) 1207 1208 # Reset context 1209 job.cur_opctx = None 1210 1211 if op.status == constants.OP_STATUS_SUCCESS: 1212 finalize = False 1213 1214 elif op.status == constants.OP_STATUS_ERROR: 1215 # Ensure failed opcode has an exception as its result 1216 assert errors.GetEncodedError(job.ops[opctx.index].result) 1217 1218 to_encode = errors.OpExecError("Preceding opcode failed") 1219 job.MarkUnfinishedOps(constants.OP_STATUS_ERROR, 1220 _EncodeOpError(to_encode)) 1221 finalize = True 1222 1223 # Consistency check 1224 assert compat.all(i.status == constants.OP_STATUS_ERROR and 1225 errors.GetEncodedError(i.result) 1226 for i in job.ops[opctx.index:]) 1227 1228 elif op.status == constants.OP_STATUS_CANCELING: 1229 job.MarkUnfinishedOps(constants.OP_STATUS_CANCELED, 1230 "Job canceled by request") 1231 finalize = True 1232 1233 else: 1234 raise errors.ProgrammerError("Unknown status '%s'" % op.status) 1235 1236 if opctx.index == (opcount - 1): 1237 # Finalize on last opcode 1238 finalize = True 1239 1240 if finalize: 1241 # All opcodes have been run, finalize job 1242 job.Finalize() 1243 1244 # Write to disk. If the job status is final, this is the final write 1245 # allowed. Once the file has been written, it can be archived anytime. 1246 queue.UpdateJobUnlocked(job) 1247 1248 assert not waitjob 1249 1250 if finalize: 1251 logging.info("Finished job %s, status = %s", job.id, job.CalcStatus()) 1252 return self.FINISHED 1253 1254 assert not waitjob or queue.depmgr.JobWaiting(job) 1255 1256 if waitjob: 1257 return self.WAITDEP 1258 else: 1259 return self.DEFER 1260 finally: 1261 assert job.writable, "Job became read-only while being processed" 1262 queue.release()
1263
1264 1265 -def _EvaluateJobProcessorResult(depmgr, job, result):
1266 """Looks at a result from L{_JobProcessor} for a job. 1267 1268 To be used in a L{_JobQueueWorker}. 1269 1270 """ 1271 if result == _JobProcessor.FINISHED: 1272 # Notify waiting jobs 1273 depmgr.NotifyWaiters(job.id) 1274 1275 elif result == _JobProcessor.DEFER: 1276 # Schedule again 1277 raise workerpool.DeferTask(priority=job.CalcPriority()) 1278 1279 elif result == _JobProcessor.WAITDEP: 1280 # No-op, dependency manager will re-schedule 1281 pass 1282 1283 else: 1284 raise errors.ProgrammerError("Job processor returned unknown status %s" % 1285 (result, ))
1286
1287 1288 -class _JobQueueWorker(workerpool.BaseWorker):
1289 """The actual job workers. 1290 1291 """
1292 - def RunTask(self, job): # pylint: disable=W0221
1293 """Job executor. 1294 1295 @type job: L{_QueuedJob} 1296 @param job: the job to be processed 1297 1298 """ 1299 assert job.writable, "Expected writable job" 1300 1301 # Ensure only one worker is active on a single job. If a job registers for 1302 # a dependency job, and the other job notifies before the first worker is 1303 # done, the job can end up in the tasklist more than once. 1304 job.processor_lock.acquire() 1305 try: 1306 return self._RunTaskInner(job) 1307 finally: 1308 job.processor_lock.release()
1309
1310 - def _RunTaskInner(self, job):
1311 """Executes a job. 1312 1313 Must be called with per-job lock acquired. 1314 1315 """ 1316 queue = job.queue 1317 assert queue == self.pool.queue 1318 1319 setname_fn = lambda op: self.SetTaskName(self._GetWorkerName(job, op)) 1320 setname_fn(None) 1321 1322 proc = mcpu.Processor(queue.context, job.id) 1323 1324 # Create wrapper for setting thread name 1325 wrap_execop_fn = compat.partial(self._WrapExecOpCode, setname_fn, 1326 proc.ExecOpCode) 1327 1328 _EvaluateJobProcessorResult(queue.depmgr, job, 1329 _JobProcessor(queue, wrap_execop_fn, job)())
1330 1331 @staticmethod
1332 - def _WrapExecOpCode(setname_fn, execop_fn, op, *args, **kwargs):
1333 """Updates the worker thread name to include a short summary of the opcode. 1334 1335 @param setname_fn: Callable setting worker thread name 1336 @param execop_fn: Callable for executing opcode (usually 1337 L{mcpu.Processor.ExecOpCode}) 1338 1339 """ 1340 setname_fn(op) 1341 try: 1342 return execop_fn(op, *args, **kwargs) 1343 finally: 1344 setname_fn(None)
1345 1346 @staticmethod
1347 - def _GetWorkerName(job, op):
1348 """Sets the worker thread name. 1349 1350 @type job: L{_QueuedJob} 1351 @type op: L{opcodes.OpCode} 1352 1353 """ 1354 parts = ["Job%s" % job.id] 1355 1356 if op: 1357 parts.append(op.TinySummary()) 1358 1359 return "/".join(parts)
1360
1361 1362 -class _JobQueueWorkerPool(workerpool.WorkerPool):
1363 """Simple class implementing a job-processing workerpool. 1364 1365 """
1366 - def __init__(self, queue):
1367 super(_JobQueueWorkerPool, self).__init__("Jq", 1368 JOBQUEUE_THREADS, 1369 _JobQueueWorker) 1370 self.queue = queue
1371
1372 1373 -class _JobDependencyManager:
1374 """Keeps track of job dependencies. 1375 1376 """ 1377 (WAIT, 1378 ERROR, 1379 CANCEL, 1380 CONTINUE, 1381 WRONGSTATUS) = range(1, 6) 1382
1383 - def __init__(self, getstatus_fn, enqueue_fn):
1384 """Initializes this class. 1385 1386 """ 1387 self._getstatus_fn = getstatus_fn 1388 self._enqueue_fn = enqueue_fn 1389 1390 self._waiters = {} 1391 self._lock = locking.SharedLock("JobDepMgr")
1392 1393 @locking.ssynchronized(_LOCK, shared=1)
1394 - def GetLockInfo(self, requested): # pylint: disable=W0613
1395 """Retrieves information about waiting jobs. 1396 1397 @type requested: set 1398 @param requested: Requested information, see C{query.LQ_*} 1399 1400 """ 1401 # No need to sort here, that's being done by the lock manager and query 1402 # library. There are no priorities for notifying jobs, hence all show up as 1403 # one item under "pending". 1404 return [("job/%s" % job_id, None, None, 1405 [("job", [job.id for job in waiters])]) 1406 for job_id, waiters in self._waiters.items() 1407 if waiters]
1408 1409 @locking.ssynchronized(_LOCK, shared=1)
1410 - def JobWaiting(self, job):
1411 """Checks if a job is waiting. 1412 1413 """ 1414 return compat.any(job in jobs 1415 for jobs in self._waiters.values())
1416 1417 @locking.ssynchronized(_LOCK)
1418 - def CheckAndRegister(self, job, dep_job_id, dep_status):
1419 """Checks if a dependency job has the requested status. 1420 1421 If the other job is not yet in a finalized status, the calling job will be 1422 notified (re-added to the workerpool) at a later point. 1423 1424 @type job: L{_QueuedJob} 1425 @param job: Job object 1426 @type dep_job_id: string 1427 @param dep_job_id: ID of dependency job 1428 @type dep_status: list 1429 @param dep_status: Required status 1430 1431 """ 1432 assert ht.TString(job.id) 1433 assert ht.TString(dep_job_id) 1434 assert ht.TListOf(ht.TElemOf(constants.JOBS_FINALIZED))(dep_status) 1435 1436 if job.id == dep_job_id: 1437 return (self.ERROR, "Job can't depend on itself") 1438 1439 # Get status of dependency job 1440 try: 1441 status = self._getstatus_fn(dep_job_id) 1442 except errors.JobLost, err: 1443 return (self.ERROR, "Dependency error: %s" % err) 1444 1445 assert status in constants.JOB_STATUS_ALL 1446 1447 job_id_waiters = self._waiters.setdefault(dep_job_id, set()) 1448 1449 if status not in constants.JOBS_FINALIZED: 1450 # Register for notification and wait for job to finish 1451 job_id_waiters.add(job) 1452 return (self.WAIT, 1453 "Need to wait for job %s, wanted status '%s'" % 1454 (dep_job_id, dep_status)) 1455 1456 # Remove from waiters list 1457 if job in job_id_waiters: 1458 job_id_waiters.remove(job) 1459 1460 if (status == constants.JOB_STATUS_CANCELED and 1461 constants.JOB_STATUS_CANCELED not in dep_status): 1462 return (self.CANCEL, "Dependency job %s was cancelled" % dep_job_id) 1463 1464 elif not dep_status or status in dep_status: 1465 return (self.CONTINUE, 1466 "Dependency job %s finished with status '%s'" % 1467 (dep_job_id, status)) 1468 1469 else: 1470 return (self.WRONGSTATUS, 1471 "Dependency job %s finished with status '%s'," 1472 " not one of '%s' as required" % 1473 (dep_job_id, status, utils.CommaJoin(dep_status)))
1474
1475 - def _RemoveEmptyWaitersUnlocked(self):
1476 """Remove all jobs without actual waiters. 1477 1478 """ 1479 for job_id in [job_id for (job_id, waiters) in self._waiters.items() 1480 if not waiters]: 1481 del self._waiters[job_id]
1482
1483 - def NotifyWaiters(self, job_id):
1484 """Notifies all jobs waiting for a certain job ID. 1485 1486 @attention: Do not call until L{CheckAndRegister} returned a status other 1487 than C{WAITDEP} for C{job_id}, or behaviour is undefined 1488 @type job_id: string 1489 @param job_id: Job ID 1490 1491 """ 1492 assert ht.TString(job_id) 1493 1494 self._lock.acquire() 1495 try: 1496 self._RemoveEmptyWaitersUnlocked() 1497 1498 jobs = self._waiters.pop(job_id, None) 1499 finally: 1500 self._lock.release() 1501 1502 if jobs: 1503 # Re-add jobs to workerpool 1504 logging.debug("Re-adding %s jobs which were waiting for job %s", 1505 len(jobs), job_id) 1506 self._enqueue_fn(jobs)
1507
1508 1509 -def _RequireOpenQueue(fn):
1510  """Decorator for "public" functions. 1511 1512 This decorator should be used for all 'public' functions. That is, 1513 functions usually called from other classes. Note that this should 1514 be applied only to methods (not plain functions), since it expects 1515 that the decorated function is called with a first argument that has 1516 a '_queue_filelock' attribute. 1517 1518 @warning: Use this decorator only after locking.ssynchronized 1519 1520 Example:: 1521 @locking.ssynchronized(_LOCK) 1522 @_RequireOpenQueue 1523 def Example(self): 1524 pass 1525 1526 """ 1527 def wrapper(self, *args, **kwargs): 1528 # pylint: disable=W0212 1529 assert self._queue_filelock is not None, "Queue should be open" 1530 return fn(self, *args, **kwargs)
1531 return wrapper 1532
1533 1534 -def _RequireNonDrainedQueue(fn):
1535 """Decorator checking for a non-drained queue. 1536 1537 To be used with functions submitting new jobs. 1538 1539 """ 1540 def wrapper(self, *args, **kwargs): 1541 """Wrapper function. 1542 1543 @raise errors.JobQueueDrainError: if the job queue is marked for draining 1544 1545 """ 1546 # Ok when sharing the big job queue lock, as the drain file is created when 1547 # the lock is exclusive. 1548 # Needs access to protected member, pylint: disable=W0212 1549 if self._drained: 1550 raise errors.JobQueueDrainError("Job queue is drained, refusing job") 1551 1552 if not self._accepting_jobs: 1553 raise errors.JobQueueError("Job queue is shutting down, refusing job") 1554 1555 return fn(self, *args, **kwargs)
1556 return wrapper 1557
1558 1559 -class JobQueue(object):
1560 """Queue used to manage the jobs. 1561 1562 """
1563 - def __init__(self, context):
1564 """Constructor for JobQueue. 1565 1566 The constructor will initialize the job queue object and then 1567 start loading the current jobs from disk, either for starting them 1568 (if they were queue) or for aborting them (if they were already 1569 running). 1570 1571 @type context: GanetiContext 1572 @param context: the context object for access to the configuration 1573 data and other ganeti objects 1574 1575 """ 1576 self.context = context 1577 self._memcache = weakref.WeakValueDictionary() 1578 self._my_hostname = netutils.Hostname.GetSysName() 1579 1580 # The Big JobQueue lock. If a code block or method acquires it in shared 1581 # mode safe it must guarantee concurrency with all the code acquiring it in 1582 # shared mode, including itself. In order not to acquire it at all 1583 # concurrency must be guaranteed with all code acquiring it in shared mode 1584 # and all code acquiring it exclusively. 1585 self._lock = locking.SharedLock("JobQueue") 1586 1587 self.acquire = self._lock.acquire 1588 self.release = self._lock.release 1589 1590 # Accept jobs by default 1591 self._accepting_jobs = True 1592 1593 # Initialize the queue, and acquire the filelock. 1594 # This ensures no other process is working on the job queue. 1595 self._queue_filelock = jstore.InitAndVerifyQueue(must_lock=True) 1596 1597 # Read serial file 1598 self._last_serial = jstore.ReadSerial() 1599 assert self._last_serial is not None, ("Serial file was modified between" 1600 " check in jstore and here") 1601 1602 # Get initial list of nodes 1603 self._nodes = dict((n.name, n.primary_ip) 1604 for n in self.context.cfg.GetAllNodesInfo().values() 1605 if n.master_candidate) 1606 1607 # Remove master node 1608 self._nodes.pop(self._my_hostname, None) 1609 1610 # TODO: Check consistency across nodes 1611 1612 self._queue_size = None 1613 self._UpdateQueueSizeUnlocked() 1614 assert ht.TInt(self._queue_size) 1615 self._drained = jstore.CheckDrainFlag() 1616 1617 # Job dependencies 1618 self.depmgr = _JobDependencyManager(self._GetJobStatusForDependencies, 1619 self._EnqueueJobs) 1620 self.context.glm.AddToLockMonitor(self.depmgr) 1621 1622 # Setup worker pool 1623 self._wpool = _JobQueueWorkerPool(self) 1624 try: 1625 self._InspectQueue() 1626 except: 1627 self._wpool.TerminateWorkers() 1628 raise
1629 1630 @locking.ssynchronized(_LOCK) 1631 @_RequireOpenQueue
1632 - def _InspectQueue(self):
1633 """Loads the whole job queue and resumes unfinished jobs. 1634 1635 This function needs the lock here because WorkerPool.AddTask() may start a 1636 job while we're still doing our work. 1637 1638 """ 1639 logging.info("Inspecting job queue") 1640 1641 restartjobs = [] 1642 1643 all_job_ids = self._GetJobIDsUnlocked() 1644 jobs_count = len(all_job_ids) 1645 lastinfo = time.time() 1646 for idx, job_id in enumerate(all_job_ids): 1647 # Give an update every 1000 jobs or 10 seconds 1648 if (idx % 1000 == 0 or time.time() >= (lastinfo + 10.0) or 1649 idx == (jobs_count - 1)): 1650 logging.info("Job queue inspection: %d/%d (%0.1f %%)", 1651 idx, jobs_count - 1, 100.0 * (idx + 1) / jobs_count) 1652 lastinfo = time.time() 1653 1654 job = self._LoadJobUnlocked(job_id) 1655 1656 # a failure in loading the job can cause 'None' to be returned 1657 if job is None: 1658 continue 1659 1660 status = job.CalcStatus() 1661 1662 if status == constants.JOB_STATUS_QUEUED: 1663 restartjobs.append(job) 1664 1665 elif status in (constants.JOB_STATUS_RUNNING, 1666 constants.JOB_STATUS_WAITING, 1667 constants.JOB_STATUS_CANCELING): 1668 logging.warning("Unfinished job %s found: %s", job.id, job) 1669 1670 if status == constants.JOB_STATUS_WAITING: 1671 # Restart job 1672 job.MarkUnfinishedOps(constants.OP_STATUS_QUEUED, None) 1673 restartjobs.append(job) 1674 else: 1675 job.MarkUnfinishedOps(constants.OP_STATUS_ERROR, 1676 "Unclean master daemon shutdown") 1677 job.Finalize() 1678 1679 self.UpdateJobUnlocked(job) 1680 1681 if restartjobs: 1682 logging.info("Restarting %s jobs", len(restartjobs)) 1683 self._EnqueueJobsUnlocked(restartjobs) 1684 1685 logging.info("Job queue inspection finished")
1686
1687 - def _GetRpc(self, address_list):
1688 """Gets RPC runner with context. 1689 1690 """ 1691 return rpc.JobQueueRunner(self.context, address_list)
1692 1693 @locking.ssynchronized(_LOCK) 1694 @_RequireOpenQueue
1695 - def AddNode(self, node):
1696 """Register a new node with the queue. 1697 1698 @type node: L{objects.Node} 1699 @param node: the node object to be added 1700 1701 """ 1702 node_name = node.name 1703 assert node_name != self._my_hostname 1704 1705 # Clean queue directory on added node 1706 result = self._GetRpc(None).call_jobqueue_purge(node_name) 1707 msg = result.fail_msg 1708 if msg: 1709 logging.warning("Cannot cleanup queue directory on node %s: %s", 1710 node_name, msg) 1711 1712 if not node.master_candidate: 1713 # remove if existing, ignoring errors 1714 self._nodes.pop(node_name, None) 1715 # and skip the replication of the job ids 1716 return 1717 1718 # Upload the whole queue excluding archived jobs 1719 files = [self._GetJobPath(job_id) for job_id in self._GetJobIDsUnlocked()] 1720 1721 # Upload current serial file 1722 files.append(constants.JOB_QUEUE_SERIAL_FILE) 1723 1724 # Static address list 1725 addrs = [node.primary_ip] 1726 1727 for file_name in files: 1728 # Read file content 1729 content = utils.ReadFile(file_name) 1730 1731 result = self._GetRpc(addrs).call_jobqueue_update([node_name], file_name, 1732 content) 1733 msg = result[node_name].fail_msg 1734 if msg: 1735 logging.error("Failed to upload file %s to node %s: %s", 1736 file_name, node_name, msg) 1737 1738 self._nodes[node_name] = node.primary_ip
1739 1740 @locking.ssynchronized(_LOCK) 1741 @_RequireOpenQueue
1742 - def RemoveNode(self, node_name):
1743 """Callback called when removing nodes from the cluster. 1744 1745 @type node_name: str 1746 @param node_name: the name of the node to remove 1747 1748 """ 1749 self._nodes.pop(node_name, None)
1750 1751 @staticmethod
1752 - def _CheckRpcResult(result, nodes, failmsg):
1753  """Verifies the status of an RPC call. 1754 1755 Since we aim to keep consistency should this node (the current 1756 master) fail, we will log errors if our rpc calls fail, and especially 1757 log the case when more than half of the nodes fail. 1758 1759 @param result: the data as returned from the rpc call 1760 @type nodes: list 1761 @param nodes: the list of nodes we made the call to 1762 @type failmsg: str 1763 @param failmsg: the identifier to be used for logging 1764 1765 """ 1766 failed = [] 1767 success = [] 1768 1769 for node in nodes: 1770 msg = result[node].fail_msg 1771 if msg: 1772 failed.append(node) 1773 logging.error("RPC call %s (%s) failed on node %s: %s", 1774 result[node].call, failmsg, node, msg) 1775 else: 1776 success.append(node) 1777 1778 # +1 for the master node 1779 if (len(success) + 1) < len(failed): 1780 # TODO: Handle failing nodes 1781 logging.error("More than half of the nodes failed")
1782
1783 - def _GetNodeIp(self):
1784 """Helper for returning the node name/ip list. 1785 1786 @rtype: (list, list) 1787 @return: a tuple of two lists, the first one with the node 1788 names and the second one with the node addresses 1789 1790 """ 1791 # TODO: Change to "tuple(map(list, zip(*self._nodes.items())))"? 1792 name_list = self._nodes.keys() 1793 addr_list = [self._nodes[name] for name in name_list] 1794 return name_list, addr_list
1795
1796 - def _UpdateJobQueueFile(self, file_name, data, replicate):
1797 """Writes a file locally and then replicates it to all nodes. 1798 1799 This function will replace the contents of a file on the local 1800 node and then replicate it to all the other nodes we have. 1801 1802 @type file_name: str 1803 @param file_name: the path of the file to be replicated 1804 @type data: str 1805 @param data: the new contents of the file 1806 @type replicate: boolean 1807 @param replicate: whether to spread the changes to the remote nodes 1808 1809 """ 1810 getents = runtime.GetEnts() 1811 utils.WriteFile(file_name, data=data, uid=getents.masterd_uid, 1812 gid=getents.masterd_gid) 1813 1814 if replicate: 1815 names, addrs = self._GetNodeIp() 1816 result = self._GetRpc(addrs).call_jobqueue_update(names, file_name, data) 1817 self._CheckRpcResult(result, self._nodes, "Updating %s" % file_name)
1818
1819 - def _RenameFilesUnlocked(self, rename):
1820 """Renames a file locally and then replicate the change. 1821 1822 This function will rename a file in the local queue directory 1823 and then replicate this rename to all the other nodes we have. 1824 1825 @type rename: list of (old, new) 1826 @param rename: List containing tuples mapping old to new names 1827 1828 """ 1829 # Rename them locally 1830 for old, new in rename: 1831 utils.RenameFile(old, new, mkdir=True) 1832 1833 # ... and on all nodes 1834 names, addrs = self._GetNodeIp() 1835 result = self._GetRpc(addrs).call_jobqueue_rename(names, rename) 1836 self._CheckRpcResult(result, self._nodes, "Renaming files (%r)" % rename)
1837 1838 @staticmethod
1839 - def _FormatJobID(job_id):
1840 """Convert a job ID to string format. 1841 1842 Currently this just does C{str(job_id)} after performing some 1843 checks, but if we want to change the job id format this will 1844 abstract this change. 1845 1846 @type job_id: int or long 1847 @param job_id: the numeric job id 1848 @rtype: str 1849 @return: the formatted job id 1850 1851 """ 1852 if not isinstance(job_id, (int, long)): 1853 raise errors.ProgrammerError("Job ID '%s' not numeric" % job_id) 1854 if job_id < 0: 1855 raise errors.ProgrammerError("Job ID %s is negative" % job_id) 1856 1857 return str(job_id)
1858 1859 @classmethod
1860 - def _GetArchiveDirectory(cls, job_id):
1861 """Returns the archive directory for a job. 1862 1863 @type job_id: str 1864 @param job_id: Job identifier 1865 @rtype: str 1866 @return: Directory name 1867 1868 """ 1869 return str(int(job_id) / JOBS_PER_ARCHIVE_DIRECTORY)
1870
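# --- Editorial example (not part of ganeti.jqueue) --------------------------
# With JOBS_PER_ARCHIVE_DIRECTORY = 10000, archived jobs are grouped into
# directories of ten thousand jobs each; for instance, job "123456" ends up
# in archive directory "12":
#
#   >>> str(int("123456") / 10000)   # Python 2 integer division
#   '12'
# -----------------------------------------------------------------------------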
1871 - def _NewSerialsUnlocked(self, count):
1872  """Generates new job identifiers. 1873 1874 Job identifiers are unique during the lifetime of a cluster. 1875 1876 @type count: integer 1877 @param count: how many serials to return 1878 @rtype: list of str 1879 @return: a list of formatted job identifiers. 1880 1881 """ 1882 assert ht.TPositiveInt(count) 1883 1884 # New number 1885 serial = self._last_serial + count 1886 1887 # Write to file 1888 self._UpdateJobQueueFile(constants.JOB_QUEUE_SERIAL_FILE, 1889 "%s\n" % serial, True) 1890 1891 result = [self._FormatJobID(v) 1892 for v in range(self._last_serial + 1, serial + 1)] 1893 1894 # Keep it only if we were able to write the file 1895 self._last_serial = serial 1896 1897 assert len(result) == count 1898 1899 return result
1900 1901 @staticmethod
1902 - def _GetJobPath(job_id):
1903 """Returns the job file for a given job id. 1904 1905 @type job_id: str 1906 @param job_id: the job identifier 1907 @rtype: str 1908 @return: the path to the job file 1909 1910 """ 1911 return utils.PathJoin(constants.QUEUE_DIR, "job-%s" % job_id)
1912 1913 @classmethod
1914 - def _GetArchivedJobPath(cls, job_id):
1915 """Returns the archived job file for a give job id. 1916 1917 @type job_id: str 1918 @param job_id: the job identifier 1919 @rtype: str 1920 @return: the path to the archived job file 1921 1922 """ 1923 return utils.PathJoin(constants.JOB_QUEUE_ARCHIVE_DIR, 1924 cls._GetArchiveDirectory(job_id), "job-%s" % job_id)
1925 1926 @staticmethod
1927 - def _GetJobIDsUnlocked(sort=True):
1928 """Return all known job IDs. 1929 1930 The method only looks at disk because it's a requirement that all 1931 jobs are present on disk (so in the _memcache we don't have any 1932 extra IDs). 1933 1934 @type sort: boolean 1935 @param sort: perform sorting on the returned job ids 1936 @rtype: list 1937 @return: the list of job IDs 1938 1939 """ 1940 jlist = [] 1941 for filename in utils.ListVisibleFiles(constants.QUEUE_DIR): 1942 m = constants.JOB_FILE_RE.match(filename) 1943 if m: 1944 jlist.append(m.group(1)) 1945 if sort: 1946 jlist = utils.NiceSort(jlist) 1947 return jlist
1948
1949 - def _LoadJobUnlocked(self, job_id):
1950 """Loads a job from the disk or memory. 1951 1952 Given a job id, this will return the cached job object if 1953 existing, or try to load the job from the disk. If loading from 1954 disk, it will also add the job to the cache. 1955 1956 @param job_id: the job id 1957 @rtype: L{_QueuedJob} or None 1958 @return: either None or the job object 1959 1960 """ 1961 job = self._memcache.get(job_id, None) 1962 if job: 1963 logging.debug("Found job %s in memcache", job_id) 1964 assert job.writable, "Found read-only job in memcache" 1965 return job 1966 1967 try: 1968 job = self._LoadJobFromDisk(job_id, False) 1969 if job is None: 1970 return job 1971 except errors.JobFileCorrupted: 1972 old_path = self._GetJobPath(job_id) 1973 new_path = self._GetArchivedJobPath(job_id) 1974 if old_path == new_path: 1975 # job already archived (future case) 1976 logging.exception("Can't parse job %s", job_id) 1977 else: 1978 # non-archived case 1979 logging.exception("Can't parse job %s, will archive.", job_id) 1980 self._RenameFilesUnlocked([(old_path, new_path)]) 1981 return None 1982 1983 assert job.writable, "Job just loaded is not writable" 1984 1985 self._memcache[job_id] = job 1986 logging.debug("Added job %s to the cache", job_id) 1987 return job
1988
1989 - def _LoadJobFromDisk(self, job_id, try_archived, writable=None):
1990 """Load the given job file from disk. 1991 1992 Given a job file, read, load and restore it in a _QueuedJob format. 1993 1994 @type job_id: string 1995 @param job_id: job identifier 1996 @type try_archived: bool 1997 @param try_archived: Whether to try loading an archived job 1998 @rtype: L{_QueuedJob} or None 1999 @return: either None or the job object 2000 2001 """ 2002 path_functions = [(self._GetJobPath, True)] 2003 2004 if try_archived: 2005 path_functions.append((self._GetArchivedJobPath, False)) 2006 2007 raw_data = None 2008 writable_default = None 2009 2010 for (fn, writable_default) in path_functions: 2011 filepath = fn(job_id) 2012 logging.debug("Loading job from %s", filepath) 2013 try: 2014 raw_data = utils.ReadFile(filepath) 2015 except EnvironmentError, err: 2016 if err.errno != errno.ENOENT: 2017 raise 2018 else: 2019 break 2020 2021 if not raw_data: 2022 return None 2023 2024 if writable is None: 2025 writable = writable_default 2026 2027 try: 2028 data = serializer.LoadJson(raw_data) 2029 job = _QueuedJob.Restore(self, data, writable) 2030 except Exception, err: # pylint: disable=W0703 2031 raise errors.JobFileCorrupted(err) 2032 2033 return job
2034
2035 - def SafeLoadJobFromDisk(self, job_id, try_archived, writable=None):
2036 """Load the given job file from disk. 2037 2038 Given a job file, read, load and restore it in a _QueuedJob format. 2039 In case of error reading the job, it gets returned as None, and the 2040 exception is logged. 2041 2042 @type job_id: string 2043 @param job_id: job identifier 2044 @type try_archived: bool 2045 @param try_archived: Whether to try loading an archived job 2046 @rtype: L{_QueuedJob} or None 2047 @return: either None or the job object 2048 2049 """ 2050 try: 2051 return self._LoadJobFromDisk(job_id, try_archived, writable=writable) 2052 except (errors.JobFileCorrupted, EnvironmentError): 2053 logging.exception("Can't load/parse job %s", job_id) 2054 return None
2055
2056 - def _UpdateQueueSizeUnlocked(self):
2057 """Update the queue size. 2058 2059 """ 2060 self._queue_size = len(self._GetJobIDsUnlocked(sort=False))
2061 2062 @locking.ssynchronized(_LOCK) 2063 @_RequireOpenQueue
2064 - def SetDrainFlag(self, drain_flag):
2065 """Sets the drain flag for the queue. 2066 2067 @type drain_flag: boolean 2068 @param drain_flag: Whether to set or unset the drain flag 2069 2070 """ 2071 jstore.SetDrainFlag(drain_flag) 2072 2073 self._drained = drain_flag 2074 2075 return True
2076 2077 @_RequireOpenQueue
2078 - def _SubmitJobUnlocked(self, job_id, ops):
2079 """Create and store a new job. 2080 2081 This enters the job into our job queue and also puts it on the new 2082 queue, in order for it to be picked up by the queue processors. 2083 2084 @type job_id: job ID 2085 @param job_id: the job ID for the new job 2086 @type ops: list 2087 @param ops: The list of OpCodes that will become the new job. 2088 @rtype: L{_QueuedJob} 2089 @return: the job object to be queued 2090 @raise errors.JobQueueFull: if the job queue has too many jobs in it 2091 @raise errors.GenericError: If an opcode is not valid 2092 2093 """ 2094 if self._queue_size >= constants.JOB_QUEUE_SIZE_HARD_LIMIT: 2095 raise errors.JobQueueFull() 2096 2097 job = _QueuedJob(self, job_id, ops, True) 2098 2099 # Check priority 2100 for idx, op in enumerate(job.ops): 2101 if op.priority not in constants.OP_PRIO_SUBMIT_VALID: 2102 allowed = utils.CommaJoin(constants.OP_PRIO_SUBMIT_VALID) 2103 raise errors.GenericError("Opcode %s has invalid priority %s, allowed" 2104 " are %s" % (idx, op.priority, allowed)) 2105 2106 dependencies = getattr(op.input, opcodes.DEPEND_ATTR, None) 2107 if not opcodes.TNoRelativeJobDependencies(dependencies): 2108 raise errors.GenericError("Opcode %s has invalid dependencies, must" 2109 " match %s: %s" % 2110 (idx, opcodes.TNoRelativeJobDependencies, 2111 dependencies)) 2112 2113 # Write to disk 2114 self.UpdateJobUnlocked(job) 2115 2116 self._queue_size += 1 2117 2118 logging.debug("Adding new job %s to the cache", job_id) 2119 self._memcache[job_id] = job 2120 2121 return job
2122 2123 @locking.ssynchronized(_LOCK) 2124 @_RequireOpenQueue 2125 @_RequireNonDrainedQueue
2126 - def SubmitJob(self, ops):
2127 """Create and store a new job. 2128 2129 @see: L{_SubmitJobUnlocked} 2130 2131 """ 2132 (job_id, ) = self._NewSerialsUnlocked(1) 2133 self._EnqueueJobsUnlocked([self._SubmitJobUnlocked(job_id, ops)]) 2134 return job_id
2135 2136 @locking.ssynchronized(_LOCK) 2137 @_RequireOpenQueue 2138 @_RequireNonDrainedQueue
2139 - def SubmitManyJobs(self, jobs):
2140 """Create and store multiple jobs. 2141 2142 @see: L{_SubmitJobUnlocked} 2143 2144 """ 2145 all_job_ids = self._NewSerialsUnlocked(len(jobs)) 2146 2147 (results, added_jobs) = \ 2148 self._SubmitManyJobsUnlocked(jobs, all_job_ids, []) 2149 2150 self._EnqueueJobsUnlocked(added_jobs) 2151 2152 return results
2153 2154 @staticmethod
2155 - def _FormatSubmitError(msg, ops):
2156 """Formats errors which occurred while submitting a job. 2157 2158 """ 2159 return ("%s; opcodes %s" % 2160 (msg, utils.CommaJoin(op.Summary() for op in ops)))
2161 2162 @staticmethod
2163 - def _ResolveJobDependencies(resolve_fn, deps):
2164 """Resolves relative job IDs in dependencies. 2165 2166 @type resolve_fn: callable 2167 @param resolve_fn: Function to resolve a relative job ID 2168 @type deps: list 2169 @param deps: Dependencies 2170 @rtype: list 2171 @return: Resolved dependencies 2172 2173 """ 2174 result = [] 2175 2176 for (dep_job_id, dep_status) in deps: 2177 if ht.TRelativeJobId(dep_job_id): 2178 assert ht.TInt(dep_job_id) and dep_job_id < 0 2179 try: 2180 job_id = resolve_fn(dep_job_id) 2181 except IndexError: 2182 # Abort 2183 return (False, "Unable to resolve relative job ID %s" % dep_job_id) 2184 else: 2185 job_id = dep_job_id 2186 2187 result.append((job_id, dep_status)) 2188 2189 return (True, result)
2190
2191 - def _SubmitManyJobsUnlocked(self, jobs, job_ids, previous_job_ids):
2192 """Create and store multiple jobs. 2193 2194 @see: L{_SubmitJobUnlocked} 2195 2196 """ 2197 results = [] 2198 added_jobs = [] 2199 2200 def resolve_fn(job_idx, reljobid): 2201 assert reljobid < 0 2202 return (previous_job_ids + job_ids[:job_idx])[reljobid]
2203 2204 for (idx, (job_id, ops)) in enumerate(zip(job_ids, jobs)): 2205 for op in ops: 2206 if getattr(op, opcodes.DEPEND_ATTR, None): 2207 (status, data) = \ 2208 self._ResolveJobDependencies(compat.partial(resolve_fn, idx), 2209 op.depends) 2210 if not status: 2211 # Abort resolving dependencies 2212 assert ht.TNonEmptyString(data), "No error message" 2213 break 2214 # Use resolved dependencies 2215 op.depends = data 2216 else: 2217 try: 2218 job = self._SubmitJobUnlocked(job_id, ops) 2219 except errors.GenericError, err: 2220 status = False 2221 data = self._FormatSubmitError(str(err), ops) 2222 else: 2223 status = True 2224 data = job_id 2225 added_jobs.append(job) 2226 2227 results.append((status, data)) 2228 2229 return (results, added_jobs)
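# Illustrative sketch (not part of the module) of the resolve_fn indexing
# above: for the job at position idx, relative IDs index into the previously
# submitted IDs plus the IDs allocated earlier in the same request.  The job
# IDs below are hypothetical.
previous_job_ids = ["10", "11"]
job_ids = ["12", "13", "14"]
idx = 2                                   # resolving dependencies of job "14"

def resolve_fn(job_idx, reljobid):
  assert reljobid < 0
  return (previous_job_ids + job_ids[:job_idx])[reljobid]

assert resolve_fn(idx, -1) == "13"        # the job submitted just before
assert resolve_fn(idx, -4) == "10"        # reaches into previous_job_ids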
2230 2231 @locking.ssynchronized(_LOCK)
2232 - def _EnqueueJobs(self, jobs):
2233 """Helper function to add jobs to worker pool's queue. 2234 2235 @type jobs: list 2236 @param jobs: List of all jobs 2237 2238 """ 2239 return self._EnqueueJobsUnlocked(jobs)
2240
2241 - def _EnqueueJobsUnlocked(self, jobs):
2242 """Helper function to add jobs to worker pool's queue. 2243 2244 @type jobs: list 2245 @param jobs: List of all jobs 2246 2247 """ 2248 assert self._lock.is_owned(shared=0), "Must own lock in exclusive mode" 2249 self._wpool.AddManyTasks([(job, ) for job in jobs], 2250 priority=[job.CalcPriority() for job in jobs])
2251
2252 - def _GetJobStatusForDependencies(self, job_id):
2253 """Gets the status of a job for dependencies. 2254 2255 @type job_id: string 2256 @param job_id: Job ID 2257 @raise errors.JobLost: If job can't be found 2258 2259 """ 2260 if not isinstance(job_id, basestring): 2261 job_id = self._FormatJobID(job_id) 2262 2263 # Not using in-memory cache as doing so would require an exclusive lock 2264 2265 # Try to load from disk 2266 job = self.SafeLoadJobFromDisk(job_id, True, writable=False) 2267 2268 assert not job.writable, "Got writable job" # pylint: disable=E1101 2269 2270 if job: 2271 return job.CalcStatus() 2272 2273 raise errors.JobLost("Job %s not found" % job_id)
2274 2275 @_RequireOpenQueue
2276 - def UpdateJobUnlocked(self, job, replicate=True):
2277 """Update a job's on disk storage. 2278 2279 After a job has been modified, this function needs to be called in 2280 order to write the changes to disk and replicate them to the other 2281 nodes. 2282 2283 @type job: L{_QueuedJob} 2284 @param job: the changed job 2285 @type replicate: boolean 2286 @param replicate: whether to replicate the change to remote nodes 2287 2288 """ 2289 if __debug__: 2290 finalized = job.CalcStatus() in constants.JOBS_FINALIZED 2291 assert (finalized ^ (job.end_timestamp is None)) 2292 assert job.writable, "Can't update read-only job" 2293 2294 filename = self._GetJobPath(job.id) 2295 data = serializer.DumpJson(job.Serialize()) 2296 logging.debug("Writing job %s to %s", job.id, filename) 2297 self._UpdateJobQueueFile(filename, data, replicate)
2298
2299 - def WaitForJobChanges(self, job_id, fields, prev_job_info, prev_log_serial, 2300 timeout):
2301 """Waits for changes in a job. 2302 2303 @type job_id: string 2304 @param job_id: Job identifier 2305 @type fields: list of strings 2306 @param fields: Which fields to check for changes 2307 @type prev_job_info: list or None 2308 @param prev_job_info: Last job information returned 2309 @type prev_log_serial: int 2310 @param prev_log_serial: Last job message serial number 2311 @type timeout: float 2312 @param timeout: maximum time to wait in seconds 2313 @rtype: tuple (job info, log entries) 2314 @return: a tuple of the job information as required via 2315 the fields parameter, and the log entries as a list 2316 2317 if the job has not changed and the timeout has expired, 2318 we instead return a special value, 2319 L{constants.JOB_NOTCHANGED}, which should be interpreted 2320 as such by the clients 2321 2322 """ 2323 load_fn = compat.partial(self.SafeLoadJobFromDisk, job_id, True, 2324 writable=False) 2325 2326 helper = _WaitForJobChangesHelper() 2327 2328 return helper(self._GetJobPath(job_id), load_fn, 2329 fields, prev_job_info, prev_log_serial, timeout)
2330 2331 @locking.ssynchronized(_LOCK) 2332 @_RequireOpenQueue
2333 - def CancelJob(self, job_id):
2334 """Cancels a job. 2335 2336 This will only succeed if the job has not started yet. 2337 2338 @type job_id: string 2339 @param job_id: job ID of job to be cancelled. 2340 2341 """ 2342 logging.info("Cancelling job %s", job_id) 2343 2344 job = self._LoadJobUnlocked(job_id) 2345 if not job: 2346 logging.debug("Job %s not found", job_id) 2347 return (False, "Job %s not found" % job_id) 2348 2349 assert job.writable, "Can't cancel read-only job" 2350 2351 (success, msg) = job.Cancel() 2352 2353 if success: 2354 # If the job was finalized (e.g. cancelled), this is the final write 2355 # allowed. The job can be archived anytime. 2356 self.UpdateJobUnlocked(job) 2357 2358 return (success, msg)
2359 2360 @_RequireOpenQueue
2361 - def _ArchiveJobsUnlocked(self, jobs):
2362 """Archives jobs. 2363 2364 @type jobs: list of L{_QueuedJob} 2365 @param jobs: Job objects 2366 @rtype: int 2367 @return: Number of archived jobs 2368 2369 """ 2370 archive_jobs = [] 2371 rename_files = [] 2372 for job in jobs: 2373 assert job.writable, "Can't archive read-only job" 2374 2375 if job.CalcStatus() not in constants.JOBS_FINALIZED: 2376 logging.debug("Job %s is not yet done", job.id) 2377 continue 2378 2379 archive_jobs.append(job) 2380 2381 old = self._GetJobPath(job.id) 2382 new = self._GetArchivedJobPath(job.id) 2383 rename_files.append((old, new)) 2384 2385 # TODO: What if 1..n files fail to rename? 2386 self._RenameFilesUnlocked(rename_files) 2387 2388 logging.debug("Successfully archived job(s) %s", 2389 utils.CommaJoin(job.id for job in archive_jobs)) 2390 2391 # Since we haven't quite checked, above, if we succeeded or failed renaming 2392 # the files, we update the cached queue size from the filesystem. When we 2393 # get around to fix the TODO: above, we can use the number of actually 2394 # archived jobs to fix this. 2395 self._UpdateQueueSizeUnlocked() 2396 return len(archive_jobs)
2397 2398 @locking.ssynchronized(_LOCK) 2399 @_RequireOpenQueue
2400 - def ArchiveJob(self, job_id):
2401 """Archives a job. 2402 2403 This is just a wrapper over L{_ArchiveJobsUnlocked}. 2404 2405 @type job_id: string 2406 @param job_id: Job ID of job to be archived. 2407 @rtype: bool 2408 @return: Whether job was archived 2409 2410 """ 2411 logging.info("Archiving job %s", job_id) 2412 2413 job = self._LoadJobUnlocked(job_id) 2414 if not job: 2415 logging.debug("Job %s not found", job_id) 2416 return False 2417 2418 return self._ArchiveJobsUnlocked([job]) == 1
2419 2420 @locking.ssynchronized(_LOCK) 2421 @_RequireOpenQueue
2422 - def AutoArchiveJobs(self, age, timeout):
2423 """Archives all jobs based on age. 2424 2425 The method will archive all jobs which are older than the age 2426 parameter. For jobs that don't have an end timestamp, the start 2427 timestamp will be considered. The special '-1' age will cause 2428 archival of all jobs (that are not running or queued). 2429 2430 @type age: int 2431 @param age: the minimum age in seconds 2432 2433 """ 2434 logging.info("Archiving jobs with age more than %s seconds", age) 2435 2436 now = time.time() 2437 end_time = now + timeout 2438 archived_count = 0 2439 last_touched = 0 2440 2441 all_job_ids = self._GetJobIDsUnlocked() 2442 pending = [] 2443 for idx, job_id in enumerate(all_job_ids): 2444 last_touched = idx + 1 2445 2446 # Not optimal because jobs could be pending 2447 # TODO: Measure average duration for job archival and take number of 2448 # pending jobs into account. 2449 if time.time() > end_time: 2450 break 2451 2452 # Returns None if the job failed to load 2453 job = self._LoadJobUnlocked(job_id) 2454 if job: 2455 if job.end_timestamp is None: 2456 if job.start_timestamp is None: 2457 job_age = job.received_timestamp 2458 else: 2459 job_age = job.start_timestamp 2460 else: 2461 job_age = job.end_timestamp 2462 2463 if age == -1 or now - job_age[0] > age: 2464 pending.append(job) 2465 2466 # Archive 10 jobs at a time 2467 if len(pending) >= 10: 2468 archived_count += self._ArchiveJobsUnlocked(pending) 2469 pending = [] 2470 2471 if pending: 2472 archived_count += self._ArchiveJobsUnlocked(pending) 2473 2474 return (archived_count, len(all_job_ids) - last_touched)
2475
2476 - def _Query(self, fields, qfilter):
2477 qobj = query.Query(query.JOB_FIELDS, fields, qfilter=qfilter, 2478 namefield="id") 2479 2480 job_ids = qobj.RequestedNames() 2481 2482 list_all = (job_ids is None) 2483 2484 if list_all: 2485 # Since files are added to/removed from the queue atomically, there's no 2486 # risk of getting the job ids in an inconsistent state. 2487 job_ids = self._GetJobIDsUnlocked() 2488 2489 jobs = [] 2490 2491 for job_id in job_ids: 2492 job = self.SafeLoadJobFromDisk(job_id, True, writable=False) 2493 if job is not None or not list_all: 2494 jobs.append((job_id, job)) 2495 2496 return (qobj, jobs, list_all)
2497
2498 - def QueryJobs(self, fields, qfilter):
2499 """Returns a list of jobs in queue. 2500 2501 @type fields: sequence 2502 @param fields: List of wanted fields 2503 @type qfilter: None or query2 filter (list) 2504 @param qfilter: Query filter 2505 2506 """ 2507 (qobj, ctx, sort_by_name) = self._Query(fields, qfilter) 2508 2509 return query.GetQueryResponse(qobj, ctx, sort_by_name=sort_by_name)
2510
2511 - def OldStyleQueryJobs(self, job_ids, fields):
2512 """Returns a list of jobs in queue. 2513 2514 @type job_ids: list 2515 @param job_ids: sequence of job identifiers or None for all 2516 @type fields: list 2517 @param fields: names of fields to return 2518 @rtype: list 2519 @return: list one element per job, each element being list with 2520 the requested fields 2521 2522 """ 2523 qfilter = qlang.MakeSimpleFilter("id", job_ids) 2524 2525 (qobj, ctx, sort_by_name) = self._Query(fields, qfilter) 2526 2527 return qobj.OldStyleQuery(ctx, sort_by_name=sort_by_name)
2528 2529 @locking.ssynchronized(_LOCK)
2530 - def PrepareShutdown(self):
2531 """Prepare to stop the job queue. 2532 2533 Disables execution of jobs in the workerpool and returns whether there are 2534 any jobs currently running. If the latter is the case, the job queue is not 2535 yet ready for shutdown. Once this function returns C{True} L{Shutdown} can 2536 be called without interfering with any job. Queued and unfinished jobs will 2537 be resumed next time. 2538 2539 Once this function has been called no new job submissions will be accepted 2540 (see L{_RequireNonDrainedQueue}). 2541 2542 @rtype: bool 2543 @return: Whether there are any running jobs 2544 2545 """ 2546 if self._accepting_jobs: 2547 self._accepting_jobs = False 2548 2549 # Tell worker pool to stop processing pending tasks 2550 self._wpool.SetActive(False) 2551 2552 return self._wpool.HasRunningTasks()
2553
2554 - def AcceptingJobsUnlocked(self):
2555 """Returns whether jobs are accepted. 2556 2557 Once L{PrepareShutdown} has been called, no new jobs are accepted and the 2558 queue is shutting down. 2559 2560 @rtype: bool 2561 2562 """ 2563 return self._accepting_jobs
2564 2565 @locking.ssynchronized(_LOCK) 2566 @_RequireOpenQueue
2567 - def Shutdown(self):
2568 """Stops the job queue. 2569 2570 This shutdowns all the worker threads an closes the queue. 2571 2572 """ 2573 self._wpool.TerminateWorkers() 2574 2575 self._queue_filelock.Close() 2576 self._queue_filelock = None
2577