Package ganeti :: Package watcher :: Module state
[hide private]
[frames | no frames]

Source Code for Module ganeti.watcher.state

  1  # 
  2  # 
  3   
  4  # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc. 
  5  # All rights reserved. 
  6  # 
  7  # Redistribution and use in source and binary forms, with or without 
  8  # modification, are permitted provided that the following conditions are 
  9  # met: 
 10  # 
 11  # 1. Redistributions of source code must retain the above copyright notice, 
 12  # this list of conditions and the following disclaimer. 
 13  # 
 14  # 2. Redistributions in binary form must reproduce the above copyright 
 15  # notice, this list of conditions and the following disclaimer in the 
 16  # documentation and/or other materials provided with the distribution. 
 17  # 
 18  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
 19  # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
 20  # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 21  # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
 22  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
 23  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 24  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
 25  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
 26  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 27  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 28  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 29   
 30   
 31  """Module keeping state for Ganeti watcher. 
 32   
 33  """ 
 34   
 35  import os 
 36  import time 
 37  import logging 
 38   
 39  from ganeti import utils 
 40  from ganeti import serializer 
 41  from ganeti import errors 
 42   
 43   
 44  # Delete any record that is older than 8 hours; this value is based on 
 45  # the fact that the current retry counter is 5, and watcher runs every 
 46  # 5 minutes, so it takes around half an hour to exceed the retry 
 47  # counter, so 8 hours (16*1/2h) seems like a reasonable reset time 
 48  RETRY_EXPIRATION = 8 * 3600  # in seconds (8 hours); compared against time.time() values
 49   
 50  KEY_CLEANUP_COUNT = "cleanup_count"  # per-instance record key: number of recorded cleanup attempts
 51  KEY_CLEANUP_WHEN = "cleanup_when"  # per-instance record key: time.time() of the latest cleanup attempt
 52  KEY_RESTART_COUNT = "restart_count"  # per-instance record key: number of recorded restart attempts
 53  KEY_RESTART_WHEN = "restart_when"  # per-instance record key: time.time() of the latest restart attempt
 54  KEY_BOOT_ID = "bootid"  # per-node record key: last boot ID stored for the node
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  The file is created if it does not exist yet; an existing file is opened
  read/write without being truncated.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object wrapping the locked descriptor, or None if the lock
      could not be acquired (the error is logged)

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # We are giving up on this file, so release the descriptor instead of
    # leaking it to the caller's process
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
79
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is kept in a dictionary with two sections, C{"instance"} and
  C{"node"}, each mapping a name to a dictionary of recorded values.

  """
  def __init__(self, statefile):
    """Read and parse the contents of an already opened state file.

    An empty file, a file that cannot be parsed and a file whose contents
    are not a dictionary are all treated as empty state.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
        if not isinstance(self._data, dict):
          # A well-formed document of the wrong shape is just as unusable
          # as a malformed one; let the handler below reset it
          raise ValueError("State data is not a dictionary")
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    # Make sure both sections exist and are dictionaries, as every other
    # method indexes into them without further checks
    for section in ("instance", "node"):
      if not isinstance(self._data.get(section), dict):
        self._data[section] = {}

    # Serialized snapshot used by Save to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If the state is unchanged since it was loaded (or last saved), only the
    timestamps of the file are updated. Otherwise the serialized state is
    written through C{utils.WriteFile} and C{self.statefile} is replaced by
    a file object wrapping the descriptor returned by that call. The file is
    not closed here; use L{Close} for that.

    @type filename: string
    @param filename: Path of the state file to write

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

    # What is on disk now is the new baseline for change detection
    self._orig_data = serialized_form

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to store; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded count, or 0 if nothing is recorded

    """
    idata = self._data["instance"]
    return idata.get(instance_name, {}).get(KEY_RESTART_COUNT, 0)

  def NumberOfCleanupAttempts(self, instance_name):
    """Returns number of previous cleanup attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded count, or 0 if nothing is recorded

    """
    idata = self._data["instance"]
    return idata.get(instance_name, {}).get(KEY_CLEANUP_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances that no longer exist are dropped, as are records
    whose most recent attempt (restart or cleanup) is older than
    C{RETRY_EXPIRATION} seconds.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records. Both kinds of attempts are taken into
    # account: a record that only holds cleanup attempts has no restart
    # timestamp and would otherwise always look expired, resetting its
    # cleanup counter on every run.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [
      name for (name, record) in idict.items()
      if max(record.get(KEY_RESTART_WHEN, 0),
             record.get(KEY_CLEANUP_WHEN, 0)) < earliest
      ]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  @staticmethod
  def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    The record for the instance is created if needed, its timestamp is set
    to the current time and its counter is incremented by one.

    @type instances: dict
    @param instances: contains instance data indexed by instance_name

    @type instance_name: string
    @param instance_name: name of the instance involved in the event

    @type key_when: string
    @param key_when: dict key for the information for when the event occurred

    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_RESTART_WHEN, KEY_RESTART_COUNT)

  def RecordCleanupAttempt(self, instance_name):
    """Record a cleanup attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being cleaned up

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Removing an unknown instance is a no-op.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)
260