ganeti.jqueue

1 # 2 # 3 4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 Google Inc. 5 # 6 # This program is free software; you can redistribute it and/or modify 7 # it under the terms of the GNU General Public License as published by 8 # the Free Software Foundation; either version 2 of the License, or 9 # (at your option) any later version. 10 # 11 # This program is distributed in the hope that it will be useful, but 12 # WITHOUT ANY WARRANTY; without even the implied warranty of 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 # General Public License for more details. 15 # 16 # You should have received a copy of the GNU General Public License 17 # along with this program; if not, write to the Free Software 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 # 02110-1301, USA. 20 21 22 """Module implementing the job queue handling. 23 24 Locking: there's a single, large lock in the L{JobQueue} class. It's 25 used by all other classes in this module. 
26 27 @var JOBQUEUE_THREADS: the number of worker threads we start for 28 processing jobs 29 30 """ 31 32 import logging 33 import errno 34 import time 35 import weakref 36 import threading 37 import itertools 38 import operator 39 40 try: 41 # pylint: disable=E0611 42 from pyinotify import pyinotify 43 except ImportError: 44 import pyinotify 45 46 from ganeti import asyncnotifier 47 from ganeti import constants 48 from ganeti import serializer 49 from ganeti import workerpool 50 from ganeti import locking 51 from ganeti import opcodes 52 from ganeti import errors 53 from ganeti import mcpu 54 from ganeti import utils 55 from ganeti import jstore 56 from ganeti import rpc 57 from ganeti import runtime 58 from ganeti import netutils 59 from ganeti import compat 60 from ganeti import ht 61 from ganeti import query 62 from ganeti import qlang 63 from ganeti import pathutils 64 from ganeti import vcluster 65 66 67 JOBQUEUE_THREADS = 25 68 69 # member lock names to be passed to @ssynchronized decorator 70 _LOCK = "_lock" 71 _QUEUE = "_queue" 72 73 #: Retrieves "id" attribute 74 _GetIdAttr = operator.attrgetter("id")

class CancelJob(Exception):
  """Raised to signal that the job being processed must be canceled.

  """

81

class QueueShutdown(Exception):
  """Raised to abort a job because the job queue is shutting down.

  """

87

def TimeStampNow():
  """Returns the current time, split by L{utils.SplitTime}.

  @rtype: tuple
  @return: the current time in the (seconds, microseconds) format

  """
  now = time.time()
  return utils.SplitTime(now)

97

def _CallJqUpdate(runner, names, file_name, content):
  """Issues a job queue update call using the virtualized file name.

  The file name is passed through L{vcluster.MakeVirtualPath} before being
  handed to the runner's C{call_jobqueue_update}.

  """
  return runner.call_jobqueue_update(names,
                                     vcluster.MakeVirtualPath(file_name),
                                     content)

105

106 107 -class _SimpleJobQuery:

108 """Wrapper for job queries. 109 110 Instance keeps list of fields cached, useful e.g. in L{_JobChangesChecker}. 111 112 """

113 - def __init__(self, fields):

114 """Initializes this class. 115 116 """ 117 self._query = query.Query(query.JOB_FIELDS, fields)

118

119 - def __call__(self, job):

120 """Executes a job query using cached field list. 121 122 """ 123 return self._query.OldStyleQuery([(job.id, job)], sort_by_name=False)[0]

124

125 126 -class _QueuedOpCode(object):

127 """Encapsulates an opcode object. 128 129 @ivar log: holds the execution log and consists of tuples 130 of the form C{(log_serial, timestamp, level, message)} 131 @ivar input: the OpCode we encapsulate 132 @ivar status: the current status 133 @ivar result: the result of the LU execution 134 @ivar start_timestamp: timestamp for the start of the execution 135 @ivar exec_timestamp: timestamp for the actual LU Exec() function invocation 136 @ivar stop_timestamp: timestamp for the end of the execution 137 138 """ 139 __slots__ = ["input", "status", "result", "log", "priority", 140 "start_timestamp", "exec_timestamp", "end_timestamp", 141 "__weakref__"] 142

143 - def __init__(self, op):

144 """Initializes instances of this class. 145 146 @type op: L{opcodes.OpCode} 147 @param op: the opcode we encapsulate 148 149 """ 150 self.input = op 151 self.status = constants.OP_STATUS_QUEUED 152 self.result = None 153 self.log = [] 154 self.start_timestamp = None 155 self.exec_timestamp = None 156 self.end_timestamp = None 157 158 # Get initial priority (it might change during the lifetime of this opcode) 159 self.priority = getattr(op, "priority", constants.OP_PRIO_DEFAULT)

160 161 @classmethod

162 - def Restore(cls, state):

163 """Restore the _QueuedOpCode from the serialized form. 164 165 @type state: dict 166 @param state: the serialized state 167 @rtype: _QueuedOpCode 168 @return: a new _QueuedOpCode instance 169 170 """ 171 obj = _QueuedOpCode.__new__(cls) 172 obj.input = opcodes.OpCode.LoadOpCode(state["input"]) 173 obj.status = state["status"] 174 obj.result = state["result"] 175 obj.log = state["log"] 176 obj.start_timestamp = state.get("start_timestamp", None) 177 obj.exec_timestamp = state.get("exec_timestamp", None) 178 obj.end_timestamp = state.get("end_timestamp", None) 179 obj.priority = state.get("priority", constants.OP_PRIO_DEFAULT) 180 return obj

181

182 - def Serialize(self):

183 """Serializes this _QueuedOpCode. 184 185 @rtype: dict 186 @return: the dictionary holding the serialized state 187 188 """ 189 return { 190 "input": self.input.__getstate__(), 191 "status": self.status, 192 "result": self.result, 193 "log": self.log, 194 "start_timestamp": self.start_timestamp, 195 "exec_timestamp": self.exec_timestamp, 196 "end_timestamp": self.end_timestamp, 197 "priority": self.priority, 198 }

199

200 201 -class _QueuedJob(object):

202 """In-memory job representation. 203 204 This is what we use to track the user-submitted jobs. Locking must 205 be taken care of by users of this class. 206 207 @type queue: L{JobQueue} 208 @ivar queue: the parent queue 209 @ivar id: the job ID 210 @type ops: list 211 @ivar ops: the list of _QueuedOpCode that constitute the job 212 @type log_serial: int 213 @ivar log_serial: holds the index for the next log entry 214 @ivar received_timestamp: the timestamp for when the job was received 215 @ivar start_timestmap: the timestamp for start of execution 216 @ivar end_timestamp: the timestamp for end of execution 217 @ivar writable: Whether the job is allowed to be modified 218 219 """ 220 # pylint: disable=W0212 221 __slots__ = ["queue", "id", "ops", "log_serial", "ops_iter", "cur_opctx", 222 "received_timestamp", "start_timestamp", "end_timestamp", 223 "__weakref__", "processor_lock", "writable", "archived"] 224

225 - def __init__(self, queue, job_id, ops, writable):

226 """Constructor for the _QueuedJob. 227 228 @type queue: L{JobQueue} 229 @param queue: our parent queue 230 @type job_id: job_id 231 @param job_id: our job id 232 @type ops: list 233 @param ops: the list of opcodes we hold, which will be encapsulated 234 in _QueuedOpCodes 235 @type writable: bool 236 @param writable: Whether job can be modified 237 238 """ 239 if not ops: 240 raise errors.GenericError("A job needs at least one opcode") 241 242 self.queue = queue 243 self.id = int(job_id) 244 self.ops = [_QueuedOpCode(op) for op in ops] 245 self.log_serial = 0 246 self.received_timestamp = TimeStampNow() 247 self.start_timestamp = None 248 self.end_timestamp = None 249 self.archived = False 250 251 self._InitInMemory(self, writable) 252 253 assert not self.archived, "New jobs can not be marked as archived"

254 255 @staticmethod

256 - def _InitInMemory(obj, writable):

257 """Initializes in-memory variables. 258 259 """ 260 obj.writable = writable 261 obj.ops_iter = None 262 obj.cur_opctx = None 263 264 # Read-only jobs are not processed and therefore don't need a lock 265 if writable: 266 obj.processor_lock = threading.Lock() 267 else: 268 obj.processor_lock = None

269

270 - def __repr__(self):

271 status = ["%s.%s" % (self.__class__.__module__, self.__class__.__name__), 272 "id=%s" % self.id, 273 "ops=%s" % ",".join([op.input.Summary() for op in self.ops])] 274 275 return "<%s at %#x>" % (" ".join(status), id(self))

276 277 @classmethod

278 - def Restore(cls, queue, state, writable, archived):

279 """Restore a _QueuedJob from serialized state: 280 281 @type queue: L{JobQueue} 282 @param queue: to which queue the restored job belongs 283 @type state: dict 284 @param state: the serialized state 285 @type writable: bool 286 @param writable: Whether job can be modified 287 @type archived: bool 288 @param archived: Whether job was already archived 289 @rtype: _JobQueue 290 @return: the restored _JobQueue instance 291 292 """ 293 obj = _QueuedJob.__new__(cls) 294 obj.queue = queue 295 obj.id = int(state["id"]) 296 obj.received_timestamp = state.get("received_timestamp", None) 297 obj.start_timestamp = state.get("start_timestamp", None) 298 obj.end_timestamp = state.get("end_timestamp", None) 299 obj.archived = archived 300 301 obj.ops = [] 302 obj.log_serial = 0 303 for op_state in state["ops"]: 304 op = _QueuedOpCode.Restore(op_state) 305 for log_entry in op.log: 306 obj.log_serial = max(obj.log_serial, log_entry[0]) 307 obj.ops.append(op) 308 309 cls._InitInMemory(obj, writable) 310 311 return obj

312

313 - def Serialize(self):

314 """Serialize the _JobQueue instance. 315 316 @rtype: dict 317 @return: the serialized state 318 319 """ 320 return { 321 "id": self.id, 322 "ops": [op.Serialize() for op in self.ops], 323 "start_timestamp": self.start_timestamp, 324 "end_timestamp": self.end_timestamp, 325 "received_timestamp": self.received_timestamp, 326 }

327

328 - def CalcStatus(self):

329 """Compute the status of this job. 330 331 This function iterates over all the _QueuedOpCodes in the job and 332 based on their status, computes the job status. 333 334 The algorithm is: 335 - if we find a cancelled, or finished with error, the job 336 status will be the same 337 - otherwise, the last opcode with the status one of: 338 - waitlock 339 - canceling 340 - running 341 342 will determine the job status 343 344 - otherwise, it means either all opcodes are queued, or success, 345 and the job status will be the same 346 347 @return: the job status 348 349 """ 350 status = constants.JOB_STATUS_QUEUED 351 352 all_success = True 353 for op in self.ops: 354 if op.status == constants.OP_STATUS_SUCCESS: 355 continue 356 357 all_success = False 358 359 if op.status == constants.OP_STATUS_QUEUED: 360 pass 361 elif op.status == constants.OP_STATUS_WAITING: 362 status = constants.JOB_STATUS_WAITING 363 elif op.status == constants.OP_STATUS_RUNNING: 364 status = constants.JOB_STATUS_RUNNING 365 elif op.status == constants.OP_STATUS_CANCELING: 366 status = constants.JOB_STATUS_CANCELING 367 break 368 elif op.status == constants.OP_STATUS_ERROR: 369 status = constants.JOB_STATUS_ERROR 370 # The whole job fails if one opcode failed 371 break 372 elif op.status == constants.OP_STATUS_CANCELED: 373 status = constants.OP_STATUS_CANCELED 374 break 375 376 if all_success: 377 status = constants.JOB_STATUS_SUCCESS 378 379 return status

380

381 - def CalcPriority(self):

382 """Gets the current priority for this job. 383 384 Only unfinished opcodes are considered. When all are done, the default 385 priority is used. 386 387 @rtype: int 388 389 """ 390 priorities = [op.priority for op in self.ops 391 if op.status not in constants.OPS_FINALIZED] 392 393 if not priorities: 394 # All opcodes are done, assume default priority 395 return constants.OP_PRIO_DEFAULT 396 397 return min(priorities)

398

399 - def GetLogEntries(self, newer_than):

400 """Selectively returns the log entries. 401 402 @type newer_than: None or int 403 @param newer_than: if this is None, return all log entries, 404 otherwise return only the log entries with serial higher 405 than this value 406 @rtype: list 407 @return: the list of the log entries selected 408 409 """ 410 if newer_than is None: 411 serial = -1 412 else: 413 serial = newer_than 414 415 entries = [] 416 for op in self.ops: 417 entries.extend(filter(lambda entry: entry[0] > serial, op.log)) 418 419 return entries

420

421 - def GetInfo(self, fields):

422 """Returns information about a job. 423 424 @type fields: list 425 @param fields: names of fields to return 426 @rtype: list 427 @return: list with one element for each field 428 @raise errors.OpExecError: when an invalid field 429 has been passed 430 431 """ 432 return _SimpleJobQuery(fields)(self)

433

434 - def MarkUnfinishedOps(self, status, result):

435 """Mark unfinished opcodes with a given status and result. 436 437 This is an utility function for marking all running or waiting to 438 be run opcodes with a given status. Opcodes which are already 439 finalised are not changed. 440 441 @param status: a given opcode status 442 @param result: the opcode result 443 444 """ 445 not_marked = True 446 for op in self.ops: 447 if op.status in constants.OPS_FINALIZED: 448 assert not_marked, "Finalized opcodes found after non-finalized ones" 449 continue 450 op.status = status 451 op.result = result 452 not_marked = False

453

454 - def Finalize(self):

455 """Marks the job as finalized. 456 457 """ 458 self.end_timestamp = TimeStampNow()

459

460 - def Cancel(self):

461 """Marks job as canceled/-ing if possible. 462 463 @rtype: tuple; (bool, string) 464 @return: Boolean describing whether job was successfully canceled or marked 465 as canceling and a text message 466 467 """ 468 status = self.CalcStatus() 469 470 if status == constants.JOB_STATUS_QUEUED: 471 self.MarkUnfinishedOps(constants.OP_STATUS_CANCELED, 472 "Job canceled by request") 473 self.Finalize() 474 return (True, "Job %s canceled" % self.id) 475 476 elif status == constants.JOB_STATUS_WAITING: 477 # The worker will notice the new status and cancel the job 478 self.MarkUnfinishedOps(constants.OP_STATUS_CANCELING, None) 479 return (True, "Job %s will be canceled" % self.id) 480 481 else: 482 logging.debug("Job %s is no longer waiting in the queue", self.id) 483 return (False, "Job %s is no longer waiting in the queue" % self.id)

484

485 - def ChangePriority(self, priority):

486 """Changes the job priority. 487 488 @type priority: int 489 @param priority: New priority 490 @rtype: tuple; (bool, string) 491 @return: Boolean describing whether job's priority was successfully changed 492 and a text message 493 494 """ 495 status = self.CalcStatus() 496 497 if status in constants.JOBS_FINALIZED: 498 return (False, "Job %s is finished" % self.id) 499 elif status == constants.JOB_STATUS_CANCELING: 500 return (False, "Job %s is cancelling" % self.id) 501 else: 502 assert status in (constants.JOB_STATUS_QUEUED, 503 constants.JOB_STATUS_WAITING, 504 constants.JOB_STATUS_RUNNING) 505 506 changed = False 507 for op in self.ops: 508 if (op.status == constants.OP_STATUS_RUNNING or 509 op.status in constants.OPS_FINALIZED): 510 assert not changed, \ 511 ("Found opcode for which priority should not be changed after" 512 " priority has been changed for previous opcodes") 513 continue 514 515 assert op.status in (constants.OP_STATUS_QUEUED, 516 constants.OP_STATUS_WAITING) 517 518 changed = True 519 520 # Set new priority (doesn't modify opcode input) 521 op.priority = priority 522 523 if changed: 524 return (True, ("Priorities of pending opcodes for job %s have been" 525 " changed to %s" % (self.id, priority))) 526 else: 527 return (False, "Job %s had no pending opcodes" % self.id)

528

class _OpExecCallbacks(mcpu.OpExecCbBase):
  """Callbacks tying the execution of one opcode to its job and queue.

  """
  def __init__(self, queue, job, op):
    """Stores the queue, job and opcode these callbacks operate on.

    @type queue: L{JobQueue}
    @param queue: Job queue
    @type job: L{_QueuedJob}
    @param job: Job object
    @type op: L{_QueuedOpCode}
    @param op: OpCode

    """
    assert queue, "Queue is missing"
    assert job, "Job is missing"
    assert op, "Opcode is missing"

    (self._queue, self._job, self._op) = (queue, job, op)

  def _CheckCancel(self):
    """Aborts via exception if cancellation or queue shutdown was requested.

    @raise CancelJob: if the opcode is marked as canceling
    @raise QueueShutdown: if the queue no longer accepts jobs

    """
    canceling = (self._op.status == constants.OP_STATUS_CANCELING)
    if canceling:
      logging.debug("Canceling opcode")
      raise CancelJob()

    # Not canceled; see whether the queue itself is going away
    accepting = self._queue.AcceptingJobsUnlocked()
    if not accepting:
      logging.debug("Queue is shutting down")
      raise QueueShutdown()

  @locking.ssynchronized(_QUEUE, shared=1)
  def NotifyStart(self):
    """Switches the opcode from lock-waiting to running.

    This is called from the mcpu code as a notifier function, when the LU is
    finally about to start the Exec() method. Of course, to have end-user
    visible results, the opcode must be initially (before calling into
    Processor.ExecOpCode) set to OP_STATUS_WAITING.

    """
    opcode = self._op
    assert opcode in self._job.ops
    assert opcode.status in (constants.OP_STATUS_WAITING,
                             constants.OP_STATUS_CANCELING)

    # A pending cancel request takes precedence over starting
    self._CheckCancel()

    logging.debug("Opcode is now running")

    opcode.status = constants.OP_STATUS_RUNNING
    opcode.exec_timestamp = TimeStampNow()

    # Finally write out the updated job
    self._queue.UpdateJobUnlocked(self._job)

  @locking.ssynchronized(_QUEUE, shared=1)
  def _AppendFeedback(self, timestamp, log_type, log_msg):
    """Appends one log entry to the opcode while holding the queue lock.

    """
    job = self._job
    job.log_serial += 1
    entry = (job.log_serial, timestamp, log_type, log_msg)
    self._op.log.append(entry)
    self._queue.UpdateJobUnlocked(job, replicate=False)

  def Feedback(self, *args):
    """Adds a log entry for the opcode.

    Accepts either a single message (logged with type
    C{constants.ELOG_MESSAGE}) or a (log type, message) pair.

    """
    argc = len(args)
    assert argc < 3

    if argc == 1:
      (log_type, log_msg) = (constants.ELOG_MESSAGE, args[0])
    else:
      (log_type, log_msg) = args

    # Splitting the time eases serialization and keeps the precision
    now = time.time()
    self._AppendFeedback(utils.SplitTime(now), log_type, log_msg)

  def CurrentPriority(self):
    """Returns the opcode's current priority, unless it must be aborted.

    """
    assert self._op.status in (constants.OP_STATUS_WAITING,
                               constants.OP_STATUS_CANCELING)

    # A pending cancel request takes precedence
    self._CheckCancel()

    return self._op.priority

  def SubmitManyJobs(self, jobs):
    """Hands the given jobs to the job queue for processing.

    See L{JobQueue.SubmitManyJobs}.

    """
    # The job queue takes care of the locking
    return self._queue.SubmitManyJobs(jobs)

635

class _JobChangesChecker(object):
  """Callable comparing a job against previously reported information.

  """
  def __init__(self, fields, prev_job_info, prev_log_serial):
    """Remembers the requested fields and the previously known state.

    @type fields: list of strings
    @param fields: Fields requested by LUXI client
    @type prev_job_info: list or None
    @param prev_job_info: previous job info, as passed by the LUXI client
    @type prev_log_serial: int
    @param prev_log_serial: previous job serial, as passed by the LUXI client

    """
    self._squery = _SimpleJobQuery(fields)
    self._prev_job_info = prev_job_info
    self._prev_log_serial = prev_log_serial

  def __call__(self, job):
    """Determines whether the job differs from the previously known state.

    @type job: L{_QueuedJob}
    @param job: Job object
    @return: C{(job_info, log_entries)} if the job has changed or is no
      longer queued, waiting or running; C{None} otherwise

    """
    assert not job.writable, "Expected read-only job"

    status = job.CalcStatus()

    # Serializing and deserializing data can cause type changes (e.g. from
    # tuple to list) or precision loss. We're doing it here so that we get
    # the same modifications as the data received from the client. Without
    # this, the comparison afterwards might fail without the data being
    # significantly different.
    # TODO: we just deserialized from disk, investigate how to make sure that
    # the job info and log entries are compatible to avoid this further step.
    # TODO: Doing something like in testutils.py:UnifyValueType might be more
    # efficient, though floats will be tricky
    job_info = serializer.LoadJson(serializer.DumpJson(self._squery(job)))
    log_entries = serializer.LoadJson(
      serializer.DumpJson(job.GetLogEntries(self._prev_log_serial)))

    # Only a job which is still queued, running or waiting can be unchanged;
    # for any other status there is no point in waiting at all.
    if status in (constants.JOB_STATUS_QUEUED,
                  constants.JOB_STATUS_RUNNING,
                  constants.JOB_STATUS_WAITING):
      info_changed = (job_info != self._prev_job_info)
      new_messages = (bool(log_entries) and
                      self._prev_log_serial != log_entries[0][0])
      if not (info_changed or new_messages):
        return None

    logging.debug("Job %s changed", job.id)
    return (job_info, log_entries)

689

class _JobFileChangesWaiter(object):
  """Waits for changes of a job file using inotify.

  """
  def __init__(self, filename, _inotify_wm_cls=pyinotify.WatchManager):
    """Sets up the watch manager, event handler and notifier.

    @type filename: string
    @param filename: Path to job file
    @raises errors.InotifyError: if the notifier cannot be setup

    """
    self._wm = watch_manager = _inotify_wm_cls()
    self._inotify_handler = handler = \
      asyncnotifier.SingleFileEventHandler(watch_manager, self._OnInotify,
                                           filename)
    self._notifier = notifier = \
      pyinotify.Notifier(watch_manager, default_proc_fun=handler)
    try:
      handler.enable()
    except Exception:
      # pyinotify doesn't close file descriptors automatically
      notifier.stop()
      raise

  def _OnInotify(self, notifier_enabled):
    """Inotify callback; re-enables the handler if it got disabled.

    """
    if notifier_enabled:
      return
    self._inotify_handler.enable()

  def Wait(self, timeout):
    """Blocks until the job file changes or the timeout expires.

    @type timeout: float
    @param timeout: Timeout in seconds
    @return: Whether there have been events

    """
    assert timeout >= 0
    notifier = self._notifier
    # check_events is given the timeout multiplied by 1000
    have_events = notifier.check_events(timeout * 1000)
    if not have_events:
      return have_events
    notifier.read_events()
    notifier.process_events()
    return have_events

  def Close(self):
    """Stops the underlying notifier, closing its file descriptor.

    """
    self._notifier.stop()

739

class _JobChangesWaiter(object):
  """Waits for job changes, creating the file waiter only when needed.

  """
  def __init__(self, filename, _waiter_cls=_JobFileChangesWaiter):
    """Remembers the file name; no waiter is created yet.

    @type filename: string
    @param filename: Path to job file

    """
    self._filename = filename
    self._waiter_cls = _waiter_cls
    self._filewaiter = None

  def Wait(self, timeout):
    """Waits for a job to change.

    @type timeout: float
    @param timeout: Timeout in seconds
    @return: Whether there have been events

    """
    if not self._filewaiter:
      # Lazy setup: Avoid inotify setup cost when job file has already
      # changed. If this point is reached, return immediately and let caller
      # check the job file again in case there were changes since the last
      # check. This avoids a race condition.
      self._filewaiter = self._waiter_cls(self._filename)
      return True

    return self._filewaiter.Wait(timeout)

  def Close(self):
    """Closes the underlying waiter, if one was created.

    """
    waiter = self._filewaiter
    if waiter:
      waiter.Close()

778

class _WaitForJobChangesHelper(object):
  """Helper class using inotify to wait for changes in a job file.

  Given the previously known job information and log serial, an instance
  reports back to the client once the job differs from that state.

  """
  @staticmethod
  def _CheckForChanges(counter, job_load_fn, check_fn):
    """Loads the job and checks it once for changes.

    @raise errors.JobLost: if the job could not be loaded
    @raise utils.RetryAgain: if the job has not changed

    """
    attempt = next(counter)
    if attempt > 0:
      # If this isn't the first check the job is given some more time to
      # change again. This gives better performance for jobs generating many
      # changes/messages.
      time.sleep(0.1)

    job = job_load_fn()
    if not job:
      raise errors.JobLost()

    result = check_fn(job)
    if result is not None:
      return result

    raise utils.RetryAgain()

  def __call__(self, filename, job_load_fn,
               fields, prev_job_info, prev_log_serial, timeout,
               _waiter_cls=_JobChangesWaiter):
    """Waits for changes on a job.

    @type filename: string
    @param filename: File on which to wait for changes
    @type job_load_fn: callable
    @param job_load_fn: Function to load job
    @type fields: list of strings
    @param fields: Which fields to check for changes
    @type prev_job_info: list or None
    @param prev_job_info: Last job information returned
    @type prev_log_serial: int
    @param prev_log_serial: Last job message serial number
    @type timeout: float
    @param timeout: maximum time to wait in seconds
    @return: the result of the changes check, C{None} if the job was lost
      or C{constants.JOB_NOTCHANGED} if the timeout expired

    """
    counter = itertools.count()
    try:
      check_fn = _JobChangesChecker(fields, prev_job_info, prev_log_serial)
      waiter = _waiter_cls(filename)
      try:
        check_once = compat.partial(self._CheckForChanges,
                                    counter, job_load_fn, check_fn)
        return utils.Retry(check_once, utils.RETRY_REMAINING_TIME, timeout,
                           wait_fn=waiter.Wait)
      finally:
        # Always release the waiter, whatever the outcome
        waiter.Close()
    except errors.JobLost:
      return None
    except utils.RetryTimeout:
      return constants.JOB_NOTCHANGED

839

def _EncodeOpError(err):
  """Encodes an error raised during opcode processing.

  Errors which are not L{errors.GenericError} instances are wrapped in an
  L{errors.OpExecError} carrying their string representation.

  """
  if not isinstance(err, errors.GenericError):
    err = errors.OpExecError(str(err))

  return errors.EncodeException(err)

851

852 853 -class _TimeoutStrategyWrapper:

854 - def __init__(self, fn):

855 """Initializes this class. 856 857 """ 858 self._fn = fn 859 self._next = None

860

861 - def _Advance(self):

862 """Gets the next timeout if necessary. 863 864 """ 865 if self._next is None: 866 self._next = self._fn()

867

868 - def Peek(self):

869 """Returns the next timeout. 870 871 """ 872 self._Advance() 873 return self._next

874

875 - def Next(self):

876 """Returns the current timeout and advances the internal state. 877 878 """ 879 self._Advance() 880 result = self._next 881 self._next = None 882 return result

883

884 885 -class _OpExecContext:

886 - def __init__(self, op, index, log_prefix, timeout_strategy_factory):

887 """Initializes this class. 888 889 """ 890 self.op = op 891 self.index = index 892 self.log_prefix = log_prefix 893 self.summary = op.input.Summary() 894 895 # Create local copy to modify 896 if getattr(op.input, opcodes.DEPEND_ATTR, None): 897 self.jobdeps = op.input.depends[:] 898 else: 899 self.jobdeps = None 900 901 self._timeout_strategy_factory = timeout_strategy_factory 902 self._ResetTimeoutStrategy()

903

904 - def _ResetTimeoutStrategy(self):

905 """Creates a new timeout strategy. 906 907 """ 908 self._timeout_strategy = \ 909 _TimeoutStrategyWrapper(self._timeout_strategy_factory().NextAttempt)

910

911 - def CheckPriorityIncrease(self):

912 """Checks whether priority can and should be increased. 913 914 Called when locks couldn't be acquired. 915 916 """ 917 op = self.op 918 919 # Exhausted all retries and next round should not use blocking acquire 920 # for locks? 921 if (self._timeout_strategy.Peek() is None and 922 op.priority > constants.OP_PRIO_HIGHEST): 923 logging.debug("Increasing priority") 924 op.priority -= 1 925 self._ResetTimeoutStrategy() 926 return True 927 928 return False

929

930 - def GetNextLockTimeout(self):

931 """Returns the next lock acquire timeout. 932 933 """ 934 return self._timeout_strategy.Next()

935

class _JobProcessor(object):
  """Helper for processing the opcodes of a single job.

  """
  (DEFER, WAITDEP, FINISHED) = (1, 2, 3)

  def __init__(self, queue, opexec_fn, job,
               _timeout_strategy_factory=mcpu.LockAttemptTimeoutStrategy):
    """Stores the queue, the opcode execution function and the job.

    """
    self.queue = queue
    self.opexec_fn = opexec_fn
    self.job = job
    self._timeout_strategy_factory = _timeout_strategy_factory

  @staticmethod
  def _FindNextOpcode(job, timeout_strategy_factory):
    """Locates the next opcode to run.

    @type job: L{_QueuedJob}
    @param job: Job object
    @param timeout_strategy_factory: Callable to create new timeout strategy
    @raise errors.ProgrammerError: if there is no opcode left or an opcode
      is already marked as running

    """
    # Create some sort of a cache to speed up locating next opcode for future
    # lookups
    # TODO: Consider splitting _QueuedJob.ops into two separate lists, one for
    # pending and one for processed ops.
    if job.ops_iter is None:
      job.ops_iter = enumerate(job.ops)

    # Continue from wherever the shared iterator was left the last time
    for (idx, op) in job.ops_iter:
      if op.status == constants.OP_STATUS_RUNNING:
        # Found an opcode already marked as running
        raise errors.ProgrammerError("Called for job marked as running")

      opctx = _OpExecContext(op, idx, "Op %s/%s" % (idx + 1, len(job.ops)),
                             timeout_strategy_factory)

      if op.status not in constants.OPS_FINALIZED:
        return opctx

      # This is a job that was partially completed before master daemon
      # shutdown, so it can be expected that some opcodes are already
      # completed successfully (if any did error out, then the whole job
      # should have been aborted and not resubmitted for processing).
      logging.info("%s: opcode %s already processed, skipping",
                   opctx.log_prefix, opctx.summary)

    # Iterator exhausted without finding a pending opcode
    raise errors.ProgrammerError("Called for a finished job")

  @staticmethod
  def _MarkWaitlock(job, op):
    """Marks an opcode as waiting for locks.

    The job's start timestamp is also set if necessary.

    @type job: L{_QueuedJob}
    @param job: Job object
    @type op: L{_QueuedOpCode}
    @param op: Opcode object
    @rtype: bool
    @return: Whether status or timestamps were modified

    """
    assert op in job.ops
    assert op.status in (constants.OP_STATUS_QUEUED,
                         constants.OP_STATUS_WAITING)

    op.result = None

    was_queued = (op.status == constants.OP_STATUS_QUEUED)
    if was_queued:
      op.status = constants.OP_STATUS_WAITING

    op_started = (op.start_timestamp is None)
    if op_started:
      op.start_timestamp = TimeStampNow()

    job_started = (job.start_timestamp is None)
    if job_started:
      job.start_timestamp = op.start_timestamp

    assert op.status == constants.OP_STATUS_WAITING

    return was_queued or op_started or job_started

  @staticmethod
  def _CheckDependencies(queue, job, opctx):
    """Checks if an opcode has dependencies and if so, processes them.

    @type queue: L{JobQueue}
    @param queue: Queue object
    @type job: L{_QueuedJob}
    @param job: Job object
    @type opctx: L{_OpExecContext}
    @param opctx: Opcode execution context
    @rtype: bool
    @return: Whether opcode will be re-scheduled by dependency tracker

    """
    op = opctx.op

    while opctx.jobdeps:
      (dep_job_id, dep_status) = opctx.jobdeps[0]

      (depresult, depmsg) = queue.depmgr.CheckAndRegister(job, dep_job_id,
                                                          dep_status)
      assert ht.TNonEmptyString(depmsg), "No dependency message"

      logging.info("%s: %s", opctx.log_prefix, depmsg)

      if depresult == _JobDependencyManager.CONTINUE:
        # Remove dependency and continue
        opctx.jobdeps.pop(0)
        continue

      if depresult == _JobDependencyManager.WAIT:
        # Need to wait for notification, dependency tracker will re-add job
        # to workerpool
        return True

      if depresult == _JobDependencyManager.CANCEL:
        # Job was cancelled, cancel this job as well
        job.Cancel()
        assert op.status == constants.OP_STATUS_CANCELING
        return False

      if depresult in (_JobDependencyManager.WRONGSTATUS,
                       _JobDependencyManager.ERROR):
        # Job failed or there was an error, this job must fail
        op.status = constants.OP_STATUS_ERROR
        op.result = _EncodeOpError(errors.OpExecError(depmsg))
        return False

      raise errors.ProgrammerError("Unknown dependency result '%s'" %
                                   depresult)

    # No dependencies (left)
    return False

  def _ExecOpCodeUnlocked(self, opctx):
    """Processes one opcode and returns the result.

    @type opctx: L{_OpExecContext}
    @param opctx: Opcode execution context
    @rtype: tuple
    @return: C{(opcode status, result)}

    """
    op = opctx.op

    assert op.status == constants.OP_STATUS_WAITING

    timeout = opctx.GetNextLockTimeout()

    try:
      # Make sure not to hold queue lock while calling ExecOpCode
      result = self.opexec_fn(op.input,
                              _OpExecCallbacks(self.queue, self.job, op),
                              timeout=timeout)
    except mcpu.LockAcquireTimeout:
      assert timeout is not None, "Received timeout for blocking acquire"
      logging.debug("Couldn't acquire locks in %0.6fs", timeout)

      assert op.status in (constants.OP_STATUS_WAITING,
                           constants.OP_STATUS_CANCELING)

      if op.status == constants.OP_STATUS_CANCELING:
        # Job was cancelled while we were waiting for the lock
        new_status = constants.OP_STATUS_CANCELING
      elif not self.queue.AcceptingJobsUnlocked():
        # Queue is shutting down, return to queued
        new_status = constants.OP_STATUS_QUEUED
      else:
        # Stay in waitlock while trying to re-acquire lock
        new_status = constants.OP_STATUS_WAITING

      return (new_status, None)

    except CancelJob:
      logging.exception("%s: Canceling job", opctx.log_prefix)
      assert op.status == constants.OP_STATUS_CANCELING
      return (constants.OP_STATUS_CANCELING, None)

    except QueueShutdown:
      logging.exception("%s: Queue is shutting down", opctx.log_prefix)

      assert op.status == constants.OP_STATUS_WAITING

      # Job hadn't been started yet, so it should return to the queue
      return (constants.OP_STATUS_QUEUED, None)

    except Exception as err: # pylint: disable=W0703
      logging.exception("%s: Caught exception in %s",
                        opctx.log_prefix, opctx.summary)
      return (constants.OP_STATUS_ERROR, _EncodeOpError(err))

    # Every handler above returns, hence the call succeeded if we get here
    logging.debug("%s: %s successful",
                  opctx.log_prefix, opctx.summary)
    return (constants.OP_STATUS_SUCCESS, result)

1137

def __call__(self, _nextop_fn=None):
  """Continues execution of a job.

  Processes at most one opcode of the job per call. The queue lock is held
  in shared mode for the whole call, except while the opcode itself is
  being executed.

  @param _nextop_fn: Callback function for tests
  @return: C{FINISHED} if job is fully processed, C{DEFER} if the job should
    be deferred and C{WAITDEP} if the dependency manager
    (L{_JobDependencyManager}) will re-schedule the job when appropriate

  """
  queue = self.queue
  job = self.job

  logging.debug("Processing job %s", job.id)

  queue.acquire(shared=1)
  try:
    opcount = len(job.ops)

    assert job.writable, "Expected writable job"

    # Don't do anything for finalized jobs
    if job.CalcStatus() in constants.JOBS_FINALIZED:
      return self.FINISHED

    # Is a previous opcode still pending?
    if job.cur_opctx:
      opctx = job.cur_opctx
      job.cur_opctx = None
    else:
      if __debug__ and _nextop_fn:
        _nextop_fn()
      opctx = self._FindNextOpcode(job, self._timeout_strategy_factory)

    op = opctx.op

    # Consistency check
    assert compat.all(i.status in (constants.OP_STATUS_QUEUED,
                                   constants.OP_STATUS_CANCELING)
                      for i in job.ops[opctx.index + 1:])

    assert op.status in (constants.OP_STATUS_QUEUED,
                         constants.OP_STATUS_WAITING,
                         constants.OP_STATUS_CANCELING)

    assert (op.priority <= constants.OP_PRIO_LOWEST and
            op.priority >= constants.OP_PRIO_HIGHEST)

    # Set to a true value if the dependency manager will re-schedule the job
    waitjob = None

    if op.status != constants.OP_STATUS_CANCELING:
      assert op.status in (constants.OP_STATUS_QUEUED,
                           constants.OP_STATUS_WAITING)

      # Prepare to start opcode
      if self._MarkWaitlock(job, op):
        # Write to disk
        queue.UpdateJobUnlocked(job)

      assert op.status == constants.OP_STATUS_WAITING
      assert job.CalcStatus() == constants.JOB_STATUS_WAITING
      assert job.start_timestamp and op.start_timestamp
      assert waitjob is None

      # Check if waiting for a job is necessary
      waitjob = self._CheckDependencies(queue, job, opctx)

      assert op.status in (constants.OP_STATUS_WAITING,
                           constants.OP_STATUS_CANCELING,
                           constants.OP_STATUS_ERROR)

      if not (waitjob or op.status in (constants.OP_STATUS_CANCELING,
                                       constants.OP_STATUS_ERROR)):
        logging.info("%s: opcode %s waiting for locks",
                     opctx.log_prefix, opctx.summary)

        assert not opctx.jobdeps, "Not all dependencies were removed"

        # The queue lock must not be held while the opcode runs; it is
        # re-acquired even if execution raises
        queue.release()
        try:
          (op_status, op_result) = self._ExecOpCodeUnlocked(opctx)
        finally:
          queue.acquire(shared=1)

        op.status = op_status
        op.result = op_result

        assert not waitjob

      if op.status in (constants.OP_STATUS_WAITING,
                       constants.OP_STATUS_QUEUED):
        # waiting: Couldn't get locks in time
        # queued: Queue is shutting down
        assert not op.end_timestamp
      else:
        # Finalize opcode
        op.end_timestamp = TimeStampNow()

        if op.status == constants.OP_STATUS_CANCELING:
          assert not compat.any(i.status != constants.OP_STATUS_CANCELING
                                for i in job.ops[opctx.index:])
        else:
          assert op.status in constants.OPS_FINALIZED

    # Decide what happens to the job based on the resulting opcode status
    if op.status == constants.OP_STATUS_QUEUED:
      # Queue is shutting down
      assert not waitjob

      finalize = False

      # Reset context
      job.cur_opctx = None

      # In no case must the status be finalized here
      assert job.CalcStatus() == constants.JOB_STATUS_QUEUED

    elif op.status == constants.OP_STATUS_WAITING or waitjob:
      finalize = False

      if not waitjob and opctx.CheckPriorityIncrease():
        # Priority was changed, need to update on-disk file
        queue.UpdateJobUnlocked(job)

      # Keep around for another round
      job.cur_opctx = opctx

      assert (op.priority <= constants.OP_PRIO_LOWEST and
              op.priority >= constants.OP_PRIO_HIGHEST)

      # In no case must the status be finalized here
      assert job.CalcStatus() == constants.JOB_STATUS_WAITING

    else:
      # Ensure all opcodes so far have been successful
      assert (opctx.index == 0 or
              compat.all(i.status == constants.OP_STATUS_SUCCESS
                         for i in job.ops[:opctx.index]))

      # Reset context
      job.cur_opctx = None

      if op.status == constants.OP_STATUS_SUCCESS:
        finalize = False

      elif op.status == constants.OP_STATUS_ERROR:
        # Ensure failed opcode has an exception as its result
        assert errors.GetEncodedError(job.ops[opctx.index].result)

        # All remaining opcodes are marked as failed too
        to_encode = errors.OpExecError("Preceding opcode failed")
        job.MarkUnfinishedOps(constants.OP_STATUS_ERROR,
                              _EncodeOpError(to_encode))
        finalize = True

        # Consistency check
        assert compat.all(i.status == constants.OP_STATUS_ERROR and
                          errors.GetEncodedError(i.result)
                          for i in job.ops[opctx.index:])

      elif op.status == constants.OP_STATUS_CANCELING:
        job.MarkUnfinishedOps(constants.OP_STATUS_CANCELED,
                              "Job canceled by request")
        finalize = True

      else:
        raise errors.ProgrammerError("Unknown status '%s'" % op.status)

      if opctx.index == (opcount - 1):
        # Finalize on last opcode
        finalize = True

      if finalize:
        # All opcodes have been run, finalize job
        job.Finalize()

      # Write to disk. If the job status is final, this is the final write
      # allowed. Once the file has been written, it can be archived anytime.
      queue.UpdateJobUnlocked(job)

      assert not waitjob

      if finalize:
        logging.info("Finished job %s, status = %s", job.id, job.CalcStatus())
        return self.FINISHED

    assert not waitjob or queue.depmgr.JobWaiting(job)

    if waitjob:
      return self.WAITDEP
    else:
      return self.DEFER
  finally:
    assert job.writable, "Job became read-only while being processed"
    queue.release()

1330

def _EvaluateJobProcessorResult(depmgr, job, result):
  """Acts upon the value returned by a L{_JobProcessor} run.

  Intended to be called from a L{_JobQueueWorker}.

  @param depmgr: Dependency manager whose waiters are notified once the job
    has finished
  @param job: Job which was processed
  @param result: Value returned by the job processor
  @raise workerpool.DeferTask: if the job must be scheduled again
  @raise errors.ProgrammerError: if C{result} is not a known value

  """
  if result == _JobProcessor.FINISHED:
    # Jobs depending on this one may proceed now
    depmgr.NotifyWaiters(job.id)
    return

  if result == _JobProcessor.DEFER:
    # Ask the worker pool to run the job again
    raise workerpool.DeferTask(priority=job.CalcPriority())

  # For WAITDEP nothing needs to be done, the dependency manager will
  # re-schedule the job; anything else is a bug
  if result != _JobProcessor.WAITDEP:
    raise errors.ProgrammerError("Job processor returned unknown status %s" %
                                 (result, ))

1353

class _JobQueueWorker(workerpool.BaseWorker):
  """Worker running jobs handed out by the job queue worker pool.

  """
  def RunTask(self, job): # pylint: disable=W0221
    """Processes a job while holding its processor lock.

    @type job: L{_QueuedJob}
    @param job: the job to be processed

    """
    assert job.writable, "Expected writable job"

    # A job may be in the task list more than once (it can register for a
    # dependency which notifies before the first worker is done), hence the
    # per-job lock guarantees a single active worker per job.
    processor_lock = job.processor_lock
    processor_lock.acquire()
    try:
      return self._RunTaskInner(job)
    finally:
      processor_lock.release()

  def _RunTaskInner(self, job):
    """Runs the job processor for a job and evaluates its result.

    The per-job lock must be held by the caller.

    """
    queue = job.queue
    assert queue == self.pool.queue

    def _SetName(op):
      """Names the worker task after the job and, if given, the opcode.

      """
      return self.SetTaskName(self._GetWorkerName(job, op))

    _SetName(None)

    proc = mcpu.Processor(queue.context, job.id)

    # Opcode execution function which also updates the thread name
    execop_fn = compat.partial(self._WrapExecOpCode, _SetName,
                               proc.ExecOpCode)

    depmgr = queue.depmgr
    outcome = _JobProcessor(queue, execop_fn, job)()

    _EvaluateJobProcessorResult(depmgr, job, outcome)

  @staticmethod
  def _WrapExecOpCode(setname_fn, execop_fn, op, *args, **kwargs):
    """Executes an opcode with the worker thread named after it.

    The name is reset once execution has finished, whether or not it
    succeeded.

    @param setname_fn: Callable setting worker thread name
    @param execop_fn: Callable for executing opcode (usually
      L{mcpu.Processor.ExecOpCode})

    """
    setname_fn(op)
    try:
      outcome = execop_fn(op, *args, **kwargs)
    finally:
      setname_fn(None)

    return outcome

  @staticmethod
  def _GetWorkerName(job, op):
    """Builds the worker thread name for a job and an optional opcode.

    @type job: L{_QueuedJob}
    @type op: L{opcodes.OpCode}
    @return: C{"Job<id>"}, followed by a slash and the opcode's tiny summary
      if an opcode was given

    """
    name = "Job%s" % job.id

    if op:
      name = "/".join([name, op.TinySummary()])

    return name

1427

class _JobQueueWorkerPool(workerpool.WorkerPool):
  """Worker pool dedicated to processing jobs.

  """
  def __init__(self, queue):
    """Sets up the pool and remembers the queue it works for.

    @param queue: Job queue object, made available as C{self.queue}

    """
    pool_name = "Jq"
    num_workers = JOBQUEUE_THREADS
    worker_class = _JobQueueWorker

    super(_JobQueueWorkerPool, self).__init__(pool_name, num_workers,
                                              worker_class)
    self.queue = queue

1438

class _JobDependencyManager:
  """Keeps track of job dependencies.

  Waiting jobs are kept in C{self._waiters}, a dictionary mapping the ID of
  a dependency job to the set of job objects waiting for it. All accesses to
  that dictionary are protected by C{self._lock}.

  """
  # Result codes returned as first item by L{CheckAndRegister}
  (WAIT,
   ERROR,
   CANCEL,
   CONTINUE,
   WRONGSTATUS) = range(1, 6)

  def __init__(self, getstatus_fn, enqueue_fn):
    """Initializes this class.

    @type getstatus_fn: callable
    @param getstatus_fn: Called with a job ID, returns the job's status; may
      raise L{errors.JobLost}
    @type enqueue_fn: callable
    @param enqueue_fn: Called with the set of jobs which have to be re-added
      to the workerpool

    """
    self._getstatus_fn = getstatus_fn
    self._enqueue_fn = enqueue_fn

    # Dependency job ID -> set of waiting job objects
    self._waiters = {}
    self._lock = locking.SharedLock("JobDepMgr")

  @locking.ssynchronized(_LOCK, shared=1)
  def GetLockInfo(self, requested): # pylint: disable=W0613
    """Retrieves information about waiting jobs.

    @type requested: set
    @param requested: Requested information, see C{query.LQ_*}
    @rtype: list
    @return: One tuple C{("job/<id>", None, None, [("job", [<waiter IDs>])])}
      for every dependency job having at least one waiter

    """
    # No need to sort here, that's being done by the lock manager and query
    # library. There are no priorities for notifying jobs, hence all show up as
    # one item under "pending".
    return [("job/%s" % job_id, None, None,
             [("job", [job.id for job in waiters])])
            for job_id, waiters in self._waiters.items()
            if waiters]

  @locking.ssynchronized(_LOCK, shared=1)
  def JobWaiting(self, job):
    """Checks if a job is waiting.

    @param job: Job object
    @rtype: bool
    @return: Whether the job is registered as waiter for any dependency job

    """
    return compat.any(job in jobs
                      for jobs in self._waiters.values())

  @locking.ssynchronized(_LOCK)
  def CheckAndRegister(self, job, dep_job_id, dep_status):
    """Checks if a dependency job has the requested status.

    If the other job is not yet in a finalized status, the calling job will be
    notified (re-added to the workerpool) at a later point.

    @type job: L{_QueuedJob}
    @param job: Job object
    @type dep_job_id: int
    @param dep_job_id: ID of dependency job
    @type dep_status: list
    @param dep_status: Required status
    @rtype: tuple
    @return: C{(result, message)}; C{result} is one of C{WAIT}, C{ERROR},
      C{CANCEL}, C{CONTINUE} and C{WRONGSTATUS}, C{message} is a
      human-readable description

    """
    assert ht.TJobId(job.id)
    assert ht.TJobId(dep_job_id)
    assert ht.TListOf(ht.TElemOf(constants.JOBS_FINALIZED))(dep_status)

    if job.id == dep_job_id:
      return (self.ERROR, "Job can't depend on itself")

    # Get status of dependency job
    try:
      status = self._getstatus_fn(dep_job_id)
    except errors.JobLost, err:
      return (self.ERROR, "Dependency error: %s" % err)

    assert status in constants.JOB_STATUS_ALL

    # This may create an empty entry; those are cleaned up by
    # _RemoveEmptyWaitersUnlocked
    job_id_waiters = self._waiters.setdefault(dep_job_id, set())

    if status not in constants.JOBS_FINALIZED:
      # Register for notification and wait for job to finish
      job_id_waiters.add(job)
      return (self.WAIT,
              "Need to wait for job %s, wanted status '%s'" %
              (dep_job_id, dep_status))

    # Remove from waiters list
    if job in job_id_waiters:
      job_id_waiters.remove(job)

    # A cancelled dependency cancels the waiting job unless cancellation was
    # explicitly accepted
    if (status == constants.JOB_STATUS_CANCELED and
        constants.JOB_STATUS_CANCELED not in dep_status):
      return (self.CANCEL, "Dependency job %s was cancelled" % dep_job_id)

    # An empty list of required statuses accepts any finalized status
    elif not dep_status or status in dep_status:
      return (self.CONTINUE,
              "Dependency job %s finished with status '%s'" %
              (dep_job_id, status))

    else:
      return (self.WRONGSTATUS,
              "Dependency job %s finished with status '%s',"
              " not one of '%s' as required" %
              (dep_job_id, status, utils.CommaJoin(dep_status)))

  def _RemoveEmptyWaitersUnlocked(self):
    """Remove all jobs without actual waiters.

    Does not acquire the lock itself.

    """
    # The IDs are collected first as the dictionary is modified in the loop
    for job_id in [job_id for (job_id, waiters) in self._waiters.items()
                   if not waiters]:
      del self._waiters[job_id]

  def NotifyWaiters(self, job_id):
    """Notifies all jobs waiting for a certain job ID.

    @attention: Do not call until L{CheckAndRegister} returned a status other
      than C{WAIT} for C{job_id}, or behaviour is undefined
    @type job_id: int
    @param job_id: Job ID

    """
    assert ht.TJobId(job_id)

    self._lock.acquire()
    try:
      self._RemoveEmptyWaitersUnlocked()

      jobs = self._waiters.pop(job_id, None)
    finally:
      self._lock.release()

    # The enqueue function is called after the lock has been released
    if jobs:
      # Re-add jobs to workerpool
      logging.debug("Re-adding %s jobs which were waiting for job %s",
                    len(jobs), job_id)
      self._enqueue_fn(jobs)

1574

def _RequireOpenQueue(fn):
  """Decorator for "public" functions.

  This function should be used for all 'public' functions. That is,
  functions usually called from other classes. Note that this should
  be applied only to methods (not plain functions), since it expects
  that the decorated function is called with a first argument that has
  a '_queue_filelock' attribute.

  The check is implemented as an assertion.

  @warning: Use this decorator only after locking.ssynchronized

  Example::
    @locking.ssynchronized(_LOCK)
    @_RequireOpenQueue
    def Example(self):
      pass

  @param fn: Method to be wrapped
  @return: Wrapper function checking the queue before calling C{fn}

  """
  def wrapper(self, *args, **kwargs):
    # pylint: disable=W0212
    assert self._queue_filelock is not None, "Queue should be open"
    return fn(self, *args, **kwargs)
  return wrapper

def _RequireNonDrainedQueue(fn):
  """Decorator checking for a non-drained queue.

  To be used with functions submitting new jobs. Must be applied to methods
  only, as the wrapper reads the C{_drained} and C{_accepting_jobs}
  attributes of its first argument.

  @param fn: Method to be wrapped
  @return: Wrapper function refusing the call if the queue is drained or not
    accepting jobs

  """
  def wrapper(self, *args, **kwargs):
    """Wrapper function.

    @raise errors.JobQueueDrainError: if the job queue is marked for draining
    @raise errors.JobQueueError: if the job queue does not accept jobs

    """
    # Ok when sharing the big job queue lock, as the drain file is created when
    # the lock is exclusive.
    # Needs access to protected member, pylint: disable=W0212
    if self._drained:
      raise errors.JobQueueDrainError("Job queue is drained, refusing job")

    if not self._accepting_jobs:
      raise errors.JobQueueError("Job queue is shutting down, refusing job")

    return fn(self, *args, **kwargs)
  return wrapper

1625 1626 -class JobQueue(object):

1627 """Queue used to manage the jobs. 1628 1629 """

def __init__(self, context):
  """Constructor for JobQueue.

  The constructor will initialize the job queue object and then
  start loading the current jobs from disk, either for starting them
  (if they were queued) or for aborting them (if they were already
  running).

  If inspecting the queue fails, the workers of the newly created pool are
  terminated before the exception is re-raised.

  @type context: GanetiContext
  @param context: the context object for access to the configuration
      data and other ganeti objects

  """
  self.context = context
  # Cache of job objects, indexed by job ID; entries are weakly referenced
  self._memcache = weakref.WeakValueDictionary()
  self._my_hostname = netutils.Hostname.GetSysName()

  # The Big JobQueue lock. A code block or method acquiring it in shared
  # mode must be safe to run concurrently with all other code acquiring it
  # in shared mode, including itself. Code not acquiring it at all must be
  # safe to run concurrently with all code acquiring it in shared mode and
  # all code acquiring it exclusively.
  self._lock = locking.SharedLock("JobQueue")

  self.acquire = self._lock.acquire
  self.release = self._lock.release

  # Accept jobs by default
  self._accepting_jobs = True

  # Initialize the queue, and acquire the filelock.
  # This ensures no other process is working on the job queue.
  self._queue_filelock = jstore.InitAndVerifyQueue(must_lock=True)

  # Read serial file
  self._last_serial = jstore.ReadSerial()
  assert self._last_serial is not None, ("Serial file was modified between"
                                         " check in jstore and here")

  # Get initial list of nodes (name -> primary IP of master candidates)
  self._nodes = dict((n.name, n.primary_ip)
                     for n in self.context.cfg.GetAllNodesInfo().values()
                     if n.master_candidate)

  # Remove master node
  self._nodes.pop(self._my_hostname, None)

  # TODO: Check consistency across nodes

  self._queue_size = None
  self._UpdateQueueSizeUnlocked()
  assert ht.TInt(self._queue_size)
  self._drained = jstore.CheckDrainFlag()

  # Job dependencies
  self.depmgr = _JobDependencyManager(self._GetJobStatusForDependencies,
                                      self._EnqueueJobs)
  self.context.glm.AddToLockMonitor(self.depmgr)

  # Setup worker pool
  self._wpool = _JobQueueWorkerPool(self)
  try:
    self._InspectQueue()
  except:
    # Deliberately catching everything: workers must not be left behind,
    # the exception itself is passed on unchanged
    self._wpool.TerminateWorkers()
    raise

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def _InspectQueue(self):
  """Loads the whole job queue and resumes unfinished jobs.

  This function needs the lock here because WorkerPool.AddTask() may start a
  job while we're still doing our work.

  Jobs found in queued or waiting status are enqueued again (waiting jobs
  have their unfinished opcodes reset to queued first). Jobs found in
  running or canceling status are marked as failed and finalized.

  """
  logging.info("Inspecting job queue")

  restartjobs = []

  all_job_ids = self._GetJobIDsUnlocked()
  jobs_count = len(all_job_ids)
  lastinfo = time.time()
  for idx, job_id in enumerate(all_job_ids):
    # Give an update every 1000 jobs or 10 seconds
    if (idx % 1000 == 0 or time.time() >= (lastinfo + 10.0) or
        idx == (jobs_count - 1)):
      logging.info("Job queue inspection: %d/%d (%0.1f %%)",
                   idx, jobs_count - 1, 100.0 * (idx + 1) / jobs_count)
      lastinfo = time.time()

    job = self._LoadJobUnlocked(job_id)

    # a failure in loading the job can cause 'None' to be returned
    if job is None:
      continue

    status = job.CalcStatus()

    if status == constants.JOB_STATUS_QUEUED:
      restartjobs.append(job)

    elif status in (constants.JOB_STATUS_RUNNING,
                    constants.JOB_STATUS_WAITING,
                    constants.JOB_STATUS_CANCELING):
      logging.warning("Unfinished job %s found: %s", job.id, job)

      if status == constants.JOB_STATUS_WAITING:
        # Restart job
        job.MarkUnfinishedOps(constants.OP_STATUS_QUEUED, None)
        restartjobs.append(job)
      else:
        # Running or canceling jobs can't be resumed
        job.MarkUnfinishedOps(constants.OP_STATUS_ERROR,
                              "Unclean master daemon shutdown")
        job.Finalize()

      # Store the modified opcode status
      self.UpdateJobUnlocked(job)

  if restartjobs:
    logging.info("Restarting %s jobs", len(restartjobs))
    self._EnqueueJobsUnlocked(restartjobs)

  logging.info("Job queue inspection finished")

1753

def _GetRpc(self, address_list):
  """Creates an RPC runner for the job queue using this queue's context.

  @param address_list: Passed to L{rpc.JobQueueRunner} as is
  @return: the newly created RPC runner

  """
  runner = rpc.JobQueueRunner(self.context, address_list)
  return runner

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def AddNode(self, node):
  """Register a new node with the queue.

  The queue directory on the node is purged first. If the node is a master
  candidate, all non-archived job files, the serial file and the drain flag
  are copied to it and the node is added to the list of known nodes;
  otherwise it is removed from that list. RPC failures are logged only.

  @type node: L{objects.Node}
  @param node: the node object to be added

  """
  node_name = node.name
  assert node_name != self._my_hostname

  # Clean queue directory on added node
  result = self._GetRpc(None).call_jobqueue_purge(node_name)
  msg = result.fail_msg
  if msg:
    logging.warning("Cannot cleanup queue directory on node %s: %s",
                    node_name, msg)

  if not node.master_candidate:
    # remove if existing, ignoring errors
    self._nodes.pop(node_name, None)
    # and skip the replication of the job ids
    return

  # Upload the whole queue excluding archived jobs
  files = [self._GetJobPath(job_id) for job_id in self._GetJobIDsUnlocked()]

  # Upload current serial file
  files.append(pathutils.JOB_QUEUE_SERIAL_FILE)

  # Static address list
  addrs = [node.primary_ip]

  for file_name in files:
    # Read file content
    content = utils.ReadFile(file_name)

    result = _CallJqUpdate(self._GetRpc(addrs), [node_name],
                           file_name, content)
    msg = result[node_name].fail_msg
    if msg:
      logging.error("Failed to upload file %s to node %s: %s",
                    file_name, node_name, msg)

  # Set queue drained flag
  result = \
    self._GetRpc(addrs).call_jobqueue_set_drain_flag([node_name],
                                                     self._drained)
  msg = result[node_name].fail_msg
  if msg:
    logging.error("Failed to set queue drained flag on node %s: %s",
                  node_name, msg)

  # From now on updates are replicated to the node
  self._nodes[node_name] = node.primary_ip

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def RemoveNode(self, node_name):
  """Forgets a node which is being removed from the cluster.

  Unknown node names are silently ignored.

  @type node_name: str
  @param node_name: the name of the node to remove

  """
  if node_name in self._nodes:
    del self._nodes[node_name]

1826 1827 @staticmethod

1828 - def _CheckRpcResult(result, nodes, failmsg):

1829 """Verifies the status of an RPC call. 1830 1831 Since we aim to keep consistency should this node (the current 1832 master) fail, we will log errors if our rpc fail, and especially 1833 log the case when more than half of the nodes fails. 1834 1835 @param result: the data as returned from the rpc call 1836 @type nodes: list 1837 @param nodes: the list of nodes we made the call to 1838 @type failmsg: str 1839 @param failmsg: the identifier to be used for logging 1840 1841 """ 1842 failed = [] 1843 success = [] 1844 1845 for node in nodes: 1846 msg = result[node].fail_msg 1847 if msg: 1848 failed.append(node) 1849 logging.error("RPC call %s (%s) failed on node %s: %s", 1850 result[node].call, failmsg, node, msg) 1851 else: 1852 success.append(node) 1853 1854 # +1 for the master node 1855 if (len(success) + 1) < len(failed): 1856 # TODO: Handle failing nodes 1857 logging.error("More than half of the nodes failed")

1858

1859 - def _GetNodeIp(self):

1860 """Helper for returning the node name/ip list. 1861 1862 @rtype: (list, list) 1863 @return: a tuple of two lists, the first one with the node 1864 names and the second one with the node addresses 1865 1866 """ 1867 # TODO: Change to "tuple(map(list, zip(*self._nodes.items())))"? 1868 name_list = self._nodes.keys() 1869 addr_list = [self._nodes[name] for name in name_list] 1870 return name_list, addr_list

1871

def _UpdateJobQueueFile(self, file_name, data, replicate):
  """Writes a queue file locally and optionally copies it to all nodes.

  The local file is always replaced first; only afterwards, and only if
  requested, the new contents are sent to the other nodes known to the
  queue.

  @type file_name: str
  @param file_name: the path of the file to be replicated
  @type data: str
  @param data: the new contents of the file
  @type replicate: boolean
  @param replicate: whether to spread the changes to the remote nodes

  """
  ents = runtime.GetEnts()
  utils.WriteFile(file_name, data=data, uid=ents.masterd_uid,
                  gid=ents.daemons_gid,
                  mode=constants.JOB_QUEUE_FILES_PERMS)

  if not replicate:
    return

  (node_names, node_addrs) = self._GetNodeIp()
  runner = self._GetRpc(node_addrs)
  rpc_result = _CallJqUpdate(runner, node_names, file_name, data)
  self._CheckRpcResult(rpc_result, self._nodes, "Updating %s" % file_name)

1895

def _RenameFilesUnlocked(self, rename):
  """Renames queue files locally and then on all other nodes.

  All local renames are done before the remote nodes are asked to do the
  same.

  @type rename: list of (old, new)
  @param rename: List containing tuples mapping old to new names

  """
  # Local renames first
  for (src, dst) in rename:
    utils.RenameFile(src, dst, mkdir=True)

  # Then replicate to the other nodes
  (node_names, node_addrs) = self._GetNodeIp()
  runner = self._GetRpc(node_addrs)
  rpc_result = runner.call_jobqueue_rename(node_names, rename)
  self._CheckRpcResult(rpc_result, self._nodes,
                       "Renaming files (%r)" % rename)

1914

def _NewSerialsUnlocked(self, count):
  """Reserves new job identifiers.

  Job identifiers are unique during the lifetime of a cluster. The serial
  file is written (and replicated) before the in-memory counter is
  advanced.

  @type count: integer
  @param count: how many serials to return
  @rtype: list of int
  @return: a list of job identifiers.

  """
  assert ht.TNonNegativeInt(count)

  first_new = self._last_serial + 1
  new_last = self._last_serial + count

  # Store the highest serial handed out
  self._UpdateJobQueueFile(pathutils.JOB_QUEUE_SERIAL_FILE,
                           "%s\n" % new_last, True)

  job_ids = []
  for serial in range(first_new, new_last + 1):
    job_ids.append(jstore.FormatJobID(serial))

  # Only remembered once the file was written successfully
  self._last_serial = new_last

  assert len(job_ids) == count

  return job_ids

@staticmethod
def _GetJobPath(job_id):
  """Computes the path of the file storing a (non-archived) job.

  @type job_id: str
  @param job_id: the job identifier
  @rtype: str
  @return: the path to the job file

  """
  file_name = "job-%s" % job_id
  return utils.PathJoin(pathutils.QUEUE_DIR, file_name)

@staticmethod
def _GetArchivedJobPath(job_id):
  """Computes the path of the file storing an archived job.

  @type job_id: str
  @param job_id: the job identifier
  @rtype: str
  @return: the path to the archived job file

  """
  archive_root = pathutils.JOB_QUEUE_ARCHIVE_DIR
  subdir = jstore.GetArchiveDirectory(job_id)
  file_name = "job-%s" % job_id

  return utils.PathJoin(archive_root, subdir, file_name)

@staticmethod
def _DetermineJobDirectories(archived):
  """Lists the directories in which job files are stored.

  The queue directory always comes first; the visible entries of the
  archive directory follow if requested.

  @type archived: bool
  @param archived: Whether to include directories for archived jobs
  @rtype: list

  """
  directories = [pathutils.QUEUE_DIR]

  if not archived:
    return directories

  archive_root = pathutils.JOB_QUEUE_ARCHIVE_DIR
  subdirs = [utils.PathJoin(archive_root, name)
             for name in utils.ListVisibleFiles(archive_root)]
  directories.extend(subdirs)

  return directories

@classmethod
def _GetJobIDsUnlocked(cls, sort=True, archived=False):
  """Collects the IDs of all jobs found on disk.

  Only the disk is examined, as it is a requirement that all jobs are
  present on disk (hence the _memcache can't hold any extra IDs).

  @type sort: boolean
  @param sort: perform sorting on the returned job ids
  @type archived: boolean
  @param archived: whether to look into the archive directories, too
  @rtype: list
  @return: the list of job IDs

  """
  job_ids = []

  for directory in cls._DetermineJobDirectories(archived):
    for name in utils.ListVisibleFiles(directory):
      match = constants.JOB_FILE_RE.match(name)
      if not match:
        # Not a job file
        continue
      job_ids.append(int(match.group(1)))

  if sort:
    job_ids.sort()

  return job_ids

2014

def _LoadJobUnlocked(self, job_id):
  """Returns a job, taken from the cache or loaded from disk.

  A job found in the cache is returned directly. Otherwise the job is read
  from its (non-archived) file and added to the cache. A job file which
  can't be parsed is moved to the archive.

  @type job_id: int
  @param job_id: the job id
  @rtype: L{_QueuedJob} or None
  @return: either None or the job object

  """
  cached = self._memcache.get(job_id, None)
  if cached:
    logging.debug("Found job %s in memcache", job_id)
    assert cached.writable, "Found read-only job in memcache"
    return cached

  try:
    loaded = self._LoadJobFromDisk(job_id, False)
  except errors.JobFileCorrupted:
    live_path = self._GetJobPath(job_id)
    archive_path = self._GetArchivedJobPath(job_id)
    if live_path != archive_path:
      # non-archived case
      logging.exception("Can't parse job %s, will archive.", job_id)
      self._RenameFilesUnlocked([(live_path, archive_path)])
    else:
      # job already archived (future case)
      logging.exception("Can't parse job %s", job_id)
    return None

  if loaded is None:
    return None

  assert loaded.writable, "Job just loaded is not writable"

  self._memcache[job_id] = loaded
  logging.debug("Added job %s to the cache", job_id)
  return loaded

2055

def _LoadJobFromDisk(self, job_id, try_archived, writable=None):
  """Load the given job file from disk.

  Given a job file, read, load and restore it in a _QueuedJob format.

  @type job_id: int
  @param job_id: job identifier
  @type try_archived: bool
  @param try_archived: Whether to try loading an archived job
  @type writable: bool or None
  @param writable: Passed to L{_QueuedJob.Restore}; if C{None}, the job is
    restored as writable exactly if it was not read from the archive
  @rtype: L{_QueuedJob} or None
  @return: either None or the job object
  @raise errors.JobFileCorrupted: if the file contents can't be restored
    into a job

  """
  # Candidate locations, each with a flag telling whether it's the archive
  path_functions = [(self._GetJobPath, False)]

  if try_archived:
    path_functions.append((self._GetArchivedJobPath, True))

  raw_data = None
  archived = None

  # After the loop "archived" refers to the location tried last
  for (fn, archived) in path_functions:
    filepath = fn(job_id)
    logging.debug("Loading job from %s", filepath)
    try:
      raw_data = utils.ReadFile(filepath)
    except EnvironmentError, err:
      # A missing file is not an error, the next location is tried
      if err.errno != errno.ENOENT:
        raise
    else:
      break

  # No file found (or the file was empty)
  if not raw_data:
    return None

  if writable is None:
    writable = not archived

  try:
    data = serializer.LoadJson(raw_data)
    job = _QueuedJob.Restore(self, data, writable, archived)
  except Exception, err: # pylint: disable=W0703
    raise errors.JobFileCorrupted(err)

  return job

2101

def SafeLoadJobFromDisk(self, job_id, try_archived, writable=None):
  """Loads a job file from disk without raising on broken files.

  Works like L{_LoadJobFromDisk}, but a corrupted job file or an
  environment error while reading it is logged and leads to C{None} being
  returned.

  @type job_id: int
  @param job_id: job identifier
  @type try_archived: bool
  @param try_archived: Whether to try loading an archived job
  @param writable: Passed on to L{_LoadJobFromDisk}
  @rtype: L{_QueuedJob} or None
  @return: either None or the job object

  """
  try:
    job = self._LoadJobFromDisk(job_id, try_archived, writable=writable)
  except (errors.JobFileCorrupted, EnvironmentError):
    logging.exception("Can't load/parse job %s", job_id)
    job = None

  return job

2122

def _UpdateQueueSizeUnlocked(self):
  """Recomputes the queue size from the job files found on disk.

  """
  job_ids = self._GetJobIDsUnlocked(sort=False)
  self._queue_size = len(job_ids)

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def SetDrainFlag(self, drain_flag):
  """Changes the drain flag of the queue, locally and on all nodes.

  @type drain_flag: boolean
  @param drain_flag: Whether to set or unset the drain flag
  @rtype: boolean
  @return: always C{True}

  """
  # Local change first
  jstore.SetDrainFlag(drain_flag)

  self._drained = drain_flag

  # Then tell the other nodes
  (node_names, node_addrs) = self._GetNodeIp()
  runner = self._GetRpc(node_addrs)
  rpc_result = runner.call_jobqueue_set_drain_flag(node_names, drain_flag)
  self._CheckRpcResult(rpc_result, self._nodes,
                       "Setting queue drain flag to %s" % drain_flag)

  return True

@_RequireOpenQueue
def _SubmitJobUnlocked(self, job_id, ops):
  """Creates a new job, validates it and stores it.

  The job is written to disk, counted towards the queue size and added to
  the job cache. Passing it to the worker pool is left to the caller.

  @type job_id: job ID
  @param job_id: the job ID for the new job
  @type ops: list
  @param ops: The list of OpCodes that will become the new job.
  @rtype: L{_QueuedJob}
  @return: the job object to be queued
  @raise errors.JobQueueFull: if the job queue has too many jobs in it
  @raise errors.GenericError: If an opcode is not valid

  """
  if self._queue_size >= constants.JOB_QUEUE_SIZE_HARD_LIMIT:
    raise errors.JobQueueFull()

  new_job = _QueuedJob(self, job_id, ops, True)

  for (op_idx, queued_op) in enumerate(new_job.ops):
    # Only some priorities are allowed at submission time
    if queued_op.priority not in constants.OP_PRIO_SUBMIT_VALID:
      valid_prios = utils.CommaJoin(constants.OP_PRIO_SUBMIT_VALID)
      prio_msg = ("Opcode %s has invalid priority %s, allowed are %s" %
                  (op_idx, queued_op.priority, valid_prios))
      raise errors.GenericError(prio_msg)

    # Relative job dependencies must have been resolved by now
    deps = getattr(queued_op.input, opcodes.DEPEND_ATTR, None)
    if not opcodes.TNoRelativeJobDependencies(deps):
      deps_msg = ("Opcode %s has invalid dependencies, must match %s: %s" %
                  (op_idx, opcodes.TNoRelativeJobDependencies, deps))
      raise errors.GenericError(deps_msg)

  # Store the job on disk
  self.UpdateJobUnlocked(new_job)

  self._queue_size += 1

  logging.debug("Adding new job %s to the cache", job_id)
  self._memcache[job_id] = new_job

  return new_job

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
@_RequireNonDrainedQueue
def SubmitJob(self, ops):
  """Submit a single job and hand it to the worker pool.

  A fresh job ID is allocated, the job is created and stored via
  L{_SubmitJobUnlocked} and then enqueued for processing.

  @type ops: list
  @param ops: The list of OpCodes that will become the new job
  @return: the ID assigned to the new job
  @see: L{_SubmitJobUnlocked}

  """
  new_ids = self._NewSerialsUnlocked(1)
  (job_id, ) = new_ids

  job = self._SubmitJobUnlocked(job_id, ops)
  self._EnqueueJobsUnlocked([job])

  return job_id

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
@_RequireNonDrainedQueue
def SubmitManyJobs(self, jobs):
  """Submit several jobs in one go.

  One job ID is allocated per entry of C{jobs}; the jobs which could be
  created are then passed to the worker pool.

  @type jobs: list
  @param jobs: one list of opcodes per job to be created
  @rtype: list
  @return: one C{(status, data)} tuple per requested job, as built by
      L{_SubmitManyJobsUnlocked}
  @see: L{_SubmitJobUnlocked}

  """
  job_ids = self._NewSerialsUnlocked(len(jobs))

  submitted = self._SubmitManyJobsUnlocked(jobs, job_ids, [])
  (results, added_jobs) = submitted

  self._EnqueueJobsUnlocked(added_jobs)

  return results

@staticmethod
def _FormatSubmitError(msg, ops):
  """Builds the error text reported for a failed job submission.

  @type msg: string
  @param msg: the error message
  @param ops: the opcodes of the job; each one's C{Summary()} is appended
  @rtype: string
  @return: message followed by the comma-joined opcode summaries

  """
  summaries = utils.CommaJoin(op.Summary() for op in ops)
  return "%s; opcodes %s" % (msg, summaries)

@staticmethod
def _ResolveJobDependencies(resolve_fn, deps):
  """Replaces relative job IDs in a dependency list by resolved ones.

  @type resolve_fn: callable
  @param resolve_fn: Function to resolve a relative job ID; expected to
      raise C{IndexError} if the ID can't be resolved
  @type deps: list
  @param deps: Dependencies, as C{(job ID, status)} pairs
  @rtype: tuple; (boolean, string or list)
  @return: If successful (first tuple item), the returned list contains
      resolved job IDs along with the requested status; if not successful,
      the second element is an error message

  """
  resolved = []

  for (wanted_id, wanted_status) in deps:
    if not ht.TRelativeJobId(wanted_id):
      # Not a relative ID, taken over unchanged
      resolved.append((wanted_id, wanted_status))
      continue

    assert ht.TInt(wanted_id) and wanted_id < 0

    try:
      absolute_id = resolve_fn(wanted_id)
    except IndexError:
      # Give up on the first ID which can't be resolved
      return (False, "Unable to resolve relative job ID %s" % wanted_id)

    resolved.append((absolute_id, wanted_status))

  return (True, resolved)

2268

def _SubmitManyJobsUnlocked(self, jobs, job_ids, previous_job_ids):
  """Create and store multiple jobs.

  For every job the dependencies of its opcodes are resolved first; only
  if that worked for all opcodes the job is submitted through
  L{_SubmitJobUnlocked}. A failure affects the result of that job only,
  the remaining jobs are still processed.

  @type jobs: list
  @param jobs: one list of opcodes per job
  @type job_ids: list
  @param job_ids: IDs to be used for the new jobs, paired with C{jobs}
  @type previous_job_ids: list
  @param previous_job_ids: job IDs placed in front of this batch's IDs
      when resolving relative (negative) job IDs
  @rtype: tuple; (list, list)
  @return: list of C{(status, data)} tuples, one per job (C{data} is the
      job ID on success and an error message otherwise), and the list of
      job objects which were created
  @see: L{_SubmitJobUnlocked}

  """
  results = []
  added_jobs = []

  def resolve_fn(job_idx, reljobid):
    # A relative ID is a negative index into the previous job IDs followed
    # by the IDs of the jobs preceding the current one in this batch; an
    # out-of-range index raises IndexError
    assert reljobid < 0
    return (previous_job_ids + job_ids[:job_idx])[reljobid]

  for (idx, (job_id, ops)) in enumerate(zip(job_ids, jobs)):
    for op in ops:
      if getattr(op, opcodes.DEPEND_ATTR, None):
        (status, data) = \
          self._ResolveJobDependencies(compat.partial(resolve_fn, idx),
                                       op.depends)
        if not status:
          # Abort resolving dependencies
          # (status/data now describe the failure and are recorded below)
          assert ht.TNonEmptyString(data), "No error message"
          break
        # Use resolved dependencies
        op.depends = data
    else:
      # Only reached if the loop above was not left via "break", i.e. all
      # dependencies could be resolved
      try:
        job = self._SubmitJobUnlocked(job_id, ops)
      except errors.GenericError, err:
        status = False
        data = self._FormatSubmitError(str(err), ops)
      else:
        status = True
        data = job_id
        added_jobs.append(job)

    results.append((status, data))

  return (results, added_jobs)

@locking.ssynchronized(_LOCK)
def _EnqueueJobs(self, jobs):
  """Locked wrapper around L{_EnqueueJobsUnlocked}.

  @type jobs: list
  @param jobs: List of all jobs
  @return: whatever L{_EnqueueJobsUnlocked} returns

  """
  result = self._EnqueueJobsUnlocked(jobs)
  return result

2318

def _EnqueueJobsUnlocked(self, jobs):
  """Adds jobs as tasks to the worker pool.

  Each job becomes one task, with the job's calculated priority and the
  job ID as task ID. The queue lock must be held in exclusive mode.

  @type jobs: list
  @param jobs: List of all jobs

  """
  assert self._lock.is_owned(shared=0), "Must own lock in exclusive mode"

  tasks = [(job, ) for job in jobs]
  priorities = [job.CalcPriority() for job in jobs]
  task_ids = map(_GetIdAttr, jobs)

  self._wpool.AddManyTasks(tasks, priority=priorities, task_id=task_ids)

2330

2331 - def _GetJobStatusForDependencies(self, job_id):

2332 """Gets the status of a job for dependencies. 2333 2334 @type job_id: int 2335 @param job_id: Job ID 2336 @raise errors.JobLost: If job can't be found 2337 2338 """ 2339 # Not using in-memory cache as doing so would require an exclusive lock 2340 2341 # Try to load from disk 2342 job = self.SafeLoadJobFromDisk(job_id, True, writable=False) 2343 2344 assert not job.writable, "Got writable job" # pylint: disable=E1101 2345 2346 if job: 2347 return job.CalcStatus() 2348 2349 raise errors.JobLost("Job %s not found" % job_id)

@_RequireOpenQueue
def UpdateJobUnlocked(self, job, replicate=True):
  """Writes the current state of a job to its queue file.

  Must be called after a job was modified so that the change is stored
  on disk and, if requested, replicated to the other nodes.

  @type job: L{_QueuedJob}
  @param job: the changed job
  @type replicate: boolean
  @param replicate: whether to replicate the change to remote nodes

  """
  if __debug__:
    # A job has an end timestamp if and only if it is finalized
    finalized = job.CalcStatus() in constants.JOBS_FINALIZED
    assert finalized != (job.end_timestamp is None)
    assert job.writable, "Can't update read-only job"
    assert not job.archived, "Can't update archived job"

  job_path = self._GetJobPath(job.id)
  payload = serializer.DumpJson(job.Serialize())

  logging.debug("Writing job %s to %s", job.id, job_path)
  self._UpdateJobQueueFile(job_path, payload, replicate)

2375

def WaitForJobChanges(self, job_id, fields, prev_job_info, prev_log_serial,
                      timeout):
  """Waits until a job changed or the timeout expired.

  The actual waiting is delegated to L{_WaitForJobChangesHelper}, which is
  given the job's file path and a function loading the job read-only from
  disk.

  @type job_id: int
  @param job_id: Job identifier
  @type fields: list of strings
  @param fields: Which fields to check for changes
  @type prev_job_info: list or None
  @param prev_job_info: Last job information returned
  @type prev_log_serial: int
  @param prev_log_serial: Last job message serial number
  @type timeout: float
  @param timeout: maximum time to wait in seconds
  @rtype: tuple (job info, log entries)
  @return: a tuple of the job information as required via
      the fields parameter, and the log entries as a list

      if the job has not changed and the timeout has expired,
      we instead return a special value,
      L{constants.JOB_NOTCHANGED}, which should be interpreted
      as such by the clients

  """
  loader = compat.partial(self.SafeLoadJobFromDisk, job_id, True,
                          writable=False)
  waiter = _WaitForJobChangesHelper()
  job_path = self._GetJobPath(job_id)

  return waiter(job_path, loader, fields, prev_job_info, prev_log_serial,
                timeout)

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def CancelJob(self, job_id):
  """Cancels a job.

  The job's own C{Cancel} method decides whether cancelling is possible;
  if it reports success the job is written back to disk.

  @type job_id: int
  @param job_id: job ID of job to be cancelled.
  @rtype: tuple; (boolean, string)
  @return: success flag and message, see L{_ModifyJobUnlocked}

  """
  logging.info("Cancelling job %s", job_id)

  def _Cancel(job):
    return job.Cancel()

  return self._ModifyJobUnlocked(job_id, _Cancel)

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def ChangeJobPriority(self, job_id, priority):
  """Changes the priority of a job.

  After the job itself accepted the new priority, the priority of the
  corresponding worker pool task is adjusted as well, if there is one.

  @type job_id: int
  @param job_id: ID of the job whose priority should be changed
  @type priority: int
  @param priority: New priority
  @rtype: tuple; (boolean, string)
  @return: success flag and message, see L{_ModifyJobUnlocked}
  @raise errors.GenericError: if the priority is not valid for submission

  """
  logging.info("Changing priority of job %s to %s", job_id, priority)

  valid_priorities = constants.OP_PRIO_SUBMIT_VALID
  if priority not in valid_priorities:
    raise errors.GenericError("Invalid priority %s, allowed are %s" %
                              (priority, utils.CommaJoin(valid_priorities)))

  def _Apply(job):
    (changed, message) = job.ChangePriority(priority)

    if not changed:
      return (changed, message)

    try:
      self._wpool.ChangeTaskPriority(job.id, job.CalcPriority())
    except workerpool.NoSuchTask:
      # Nothing to re-prioritize in the pool
      logging.debug("Job %s is not in workerpool at this time", job.id)

    return (changed, message)

  return self._ModifyJobUnlocked(job_id, _Apply)

2454 - def _ModifyJobUnlocked(self, job_id, mod_fn):

2455 """Modifies a job. 2456 2457 @type job_id: int 2458 @param job_id: Job ID 2459 @type mod_fn: callable 2460 @param mod_fn: Modifying function, receiving job object as parameter, 2461 returning tuple of (status boolean, message string) 2462 2463 """ 2464 job = self._LoadJobUnlocked(job_id) 2465 if not job: 2466 logging.debug("Job %s not found", job_id) 2467 return (False, "Job %s not found" % job_id) 2468 2469 assert job.writable, "Can't modify read-only job" 2470 assert not job.archived, "Can't modify archived job" 2471 2472 (success, msg) = mod_fn(job) 2473 2474 if success: 2475 # If the job was finalized (e.g. cancelled), this is the final write 2476 # allowed. The job can be archived anytime. 2477 self.UpdateJobUnlocked(job) 2478 2479 return (success, msg)

@_RequireOpenQueue
def _ArchiveJobsUnlocked(self, jobs):
  """Moves the files of finalized jobs to the archive.

  Jobs whose status is not one of the finalized ones are skipped.

  @type jobs: list of L{_QueuedJob}
  @param jobs: Job objects
  @rtype: int
  @return: Number of archived jobs

  """
  finalized_jobs = []
  renames = []

  for job in jobs:
    assert job.writable, "Can't archive read-only job"
    assert not job.archived, "Can't cancel archived job"

    if job.CalcStatus() in constants.JOBS_FINALIZED:
      finalized_jobs.append(job)
      renames.append((self._GetJobPath(job.id),
                      self._GetArchivedJobPath(job.id)))
    else:
      logging.debug("Job %s is not yet done", job.id)

  # TODO: What if 1..n files fail to rename?
  self._RenameFilesUnlocked(renames)

  logging.debug("Successfully archived job(s) %s",
                utils.CommaJoin(job.id for job in finalized_jobs))

  # The result of the renames isn't checked above, hence the cached queue
  # size is refreshed from the filesystem. Once the TODO: above is fixed,
  # the number of actually archived jobs can be used instead.
  self._UpdateQueueSizeUnlocked()

  return len(finalized_jobs)

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def ArchiveJob(self, job_id):
  """Archives a single job.

  Loads the job and passes it to L{_ArchiveJobsUnlocked}.

  @type job_id: int
  @param job_id: Job ID of job to be archived.
  @rtype: bool
  @return: Whether job was archived

  """
  logging.info("Archiving job %s", job_id)

  job = self._LoadJobUnlocked(job_id)
  if job:
    return self._ArchiveJobsUnlocked([job]) == 1

  logging.debug("Job %s not found", job_id)
  return False

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def AutoArchiveJobs(self, age, timeout):
  """Archives all jobs based on age.

  The method will archive all jobs which are older than the age
  parameter. The age of a job is taken from its end timestamp; for jobs
  without one the start timestamp is used and, failing that, the received
  timestamp. The special '-1' age will cause archival of all jobs (that
  are finalized; others are skipped by L{_ArchiveJobsUnlocked}).

  @type age: int
  @param age: the minimum age in seconds
  @type timeout: number
  @param timeout: time in seconds after which no further jobs are
      examined; jobs already selected are still archived
  @rtype: tuple; (int, int)
  @return: number of archived jobs and number of jobs which were not
      examined because the timeout was reached

  """
  logging.info("Archiving jobs with age more than %s seconds", age)

  now = time.time()
  end_time = now + timeout
  archived_count = 0
  last_touched = 0

  all_job_ids = self._GetJobIDsUnlocked()
  pending = []
  for idx, job_id in enumerate(all_job_ids):
    # Not optimal because jobs could be pending
    # TODO: Measure average duration for job archival and take number of
    # pending jobs into account.
    if time.time() > end_time:
      break

    # Only count the job as handled once it is actually going to be
    # examined; otherwise the job we stopped at would be missing from the
    # number of remaining jobs
    last_touched = idx + 1

    # Returns None if the job failed to load
    job = self._LoadJobUnlocked(job_id)
    if job:
      if job.end_timestamp is None:
        if job.start_timestamp is None:
          job_age = job.received_timestamp
        else:
          job_age = job.start_timestamp
      else:
        job_age = job.end_timestamp

      # First element of the timestamp is compared against the current
      # time (presumably the seconds part of a split timestamp)
      if age == -1 or now - job_age[0] > age:
        pending.append(job)

        # Archive 10 jobs at a time
        if len(pending) >= 10:
          archived_count += self._ArchiveJobsUnlocked(pending)
          pending = []

  if pending:
    archived_count += self._ArchiveJobsUnlocked(pending)

  return (archived_count, len(all_job_ids) - last_touched)

2597

def _Query(self, fields, qfilter):
  """Prepares a job query and loads the jobs it refers to.

  @type fields: sequence
  @param fields: List of wanted fields
  @type qfilter: None or query2 filter (list)
  @param qfilter: Query filter
  @rtype: tuple
  @return: the query object, a list of C{(job ID, job)} tuples and a
      boolean telling whether all jobs were listed (as opposed to jobs
      requested by name); for jobs requested by name the job object is
      C{None} if it couldn't be loaded

  """
  qobj = query.Query(query.JOB_FIELDS, fields, qfilter=qfilter,
                     namefield="id")

  # Archived jobs are ignored by default; they are only considered if the
  # "archived" field is referenced, be it as a requested field or in the
  # filter.
  include_archived = (query.JQ_ARCHIVED in qobj.RequestedData())

  job_ids = qobj.RequestedNames()
  list_all = (job_ids is None)

  if list_all:
    # Files are added to and removed from the queue atomically, hence
    # there is no risk of getting the job IDs in an inconsistent state.
    job_ids = self._GetJobIDsUnlocked(archived=include_archived)

  loaded = [(job_id, self.SafeLoadJobFromDisk(job_id, True, writable=False))
            for job_id in job_ids]

  if list_all:
    # When listing everything, jobs which couldn't be loaded are dropped
    jobs = [entry for entry in loaded if entry[1] is not None]
  else:
    jobs = loaded

  return (qobj, jobs, list_all)

2624

2625 - def QueryJobs(self, fields, qfilter):

2626 """Returns a list of jobs in queue. 2627 2628 @type fields: sequence 2629 @param fields: List of wanted fields 2630 @type qfilter: None or query2 filter (list) 2631 @param qfilter: Query filter 2632 2633 """ 2634 (qobj, ctx, _) = self._Query(fields, qfilter) 2635 2636 return query.GetQueryResponse(qobj, ctx, sort_by_name=False)

2637

2638 - def OldStyleQueryJobs(self, job_ids, fields):

2639 """Returns a list of jobs in queue. 2640 2641 @type job_ids: list 2642 @param job_ids: sequence of job identifiers or None for all 2643 @type fields: list 2644 @param fields: names of fields to return 2645 @rtype: list 2646 @return: list one element per job, each element being list with 2647 the requested fields 2648 2649 """ 2650 # backwards compat: 2651 job_ids = [int(jid) for jid in job_ids] 2652 qfilter = qlang.MakeSimpleFilter("id", job_ids) 2653 2654 (qobj, ctx, _) = self._Query(fields, qfilter) 2655 2656 return qobj.OldStyleQuery(ctx, sort_by_name=False)

@locking.ssynchronized(_LOCK)
def PrepareShutdown(self):
  """Prepare to stop the job queue.

  Stops accepting jobs, disables execution of pending tasks in the
  workerpool and reports whether any jobs are still running. As long as
  that is the case the job queue is not yet ready for shutdown; once no
  jobs are running anymore L{Shutdown} can be called without interfering
  with any job. Queued and unfinished jobs will be resumed next time.

  Once this function has been called no new job submissions will be
  accepted (see L{_RequireNonDrainedQueue}).

  @rtype: bool
  @return: Whether there are any running jobs

  """
  still_accepting = self._accepting_jobs

  if still_accepting:
    self._accepting_jobs = False

    # Tell worker pool to stop processing pending tasks
    self._wpool.SetActive(False)

  return self._wpool.HasRunningTasks()

2682

def AcceptingJobsUnlocked(self):
  """Tells whether the queue still accepts jobs.

  The flag is cleared by L{PrepareShutdown}; from then on no new jobs are
  accepted and the queue is shutting down.

  @rtype: bool
  @return: the current value of the "accepting jobs" flag

  """
  accepting = self._accepting_jobs
  return accepting

@locking.ssynchronized(_LOCK)
@_RequireOpenQueue
def Shutdown(self):
  """Stops the job queue.

  This shuts down all the worker threads and closes the queue by closing
  and dropping its file lock.

  """
  self._wpool.TerminateWorkers()

  queue_lock = self._queue_filelock
  queue_lock.Close()
  self._queue_filelock = None

2706

Source Code for Module ganeti.jqueue