1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """Module keeping state for Ganeti watcher.
32
33 """
34
35 import os
36 import time
37 import logging
38
39 from ganeti import utils
40 from ganeti import serializer
41 from ganeti import errors
42
43
44
45
46
47
# Age, in seconds (8 hours), after which a recorded instance entry is dropped
# during instance list maintenance.
RETRY_EXPIRATION = 8 * 3600

# Keys of a per-instance record: how often, and when last, a cleanup or a
# restart was attempted.
KEY_CLEANUP_COUNT = "cleanup_count"
KEY_CLEANUP_WHEN = "cleanup_when"
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"

# Key of a per-node record: the last boot ID seen for that node.
KEY_BOOT_ID = "bootid"
58 """Opens the state file and acquires a lock on it.
59
60 @type path: string
61 @param path: Path to state file
62
63 """
64
65
66
67 statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
68
69
70
71
72 try:
73 utils.LockFile(statefile_fd)
74 except errors.LockError, err:
75 logging.error("Can't acquire lock on state file %s: %s", path, err)
76 return None
77
78 return os.fdopen(statefile_fd, "w+")
79
82 """Interface to a state file recording restart attempts.
83
84 """
86 """Open, lock, read and parse the file.
87
88 @type statefile: file
89 @param statefile: State file object
90
91 """
92 self.statefile = statefile
93
94 try:
95 state_data = self.statefile.read()
96 if not state_data:
97 self._data = {}
98 else:
99 self._data = serializer.Load(state_data)
100 except Exception, msg:
101
102 self._data = {}
103 logging.warning(("Invalid state file. Using defaults."
104 " Error message: %s"), msg)
105
106 if "instance" not in self._data:
107 self._data["instance"] = {}
108 if "node" not in self._data:
109 self._data["node"] = {}
110
111 self._orig_data = serializer.Dump(self._data)
112
def Save(self, filename):
    """Save the state to the given file.

    The current data is serialized and compared with the snapshot taken
    when the state was loaded (self._orig_data). If nothing changed, the
    file is not rewritten and only its access and modification times are
    set to the current time. Otherwise the serialized data is written
    through utils.WriteFile and the descriptor it returns becomes the new
    self.statefile. This method does not close the state file.

    @type filename: string
    @param filename: Path to the state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if serialized_form == self._orig_data:
        logging.debug("Data didn't change, just touching status file")
        os.utime(filename, None)
        return

    # NOTE(review): prewrite=utils.LockFile presumably locks the new file
    # before it replaces the old one, and close=False keeps the descriptor
    # open for the caller -- confirm against utils.WriteFile.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")
131
133 """Unlock configuration file and close it.
134
135 """
136 assert self.statefile
137
138
139 self.statefile.close()
140 self.statefile = None
141
143 """Returns the last boot ID of a node or None.
144
145 """
146 ndata = self._data["node"]
147
148 if name in ndata and KEY_BOOT_ID in ndata[name]:
149 return ndata[name][KEY_BOOT_ID]
150 return None
151
153 """Sets the boot ID of a node.
154
155 """
156 assert bootid
157
158 ndata = self._data["node"]
159
160 ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid
161
163 """Returns number of previous restart attempts.
164
165 @type instance_name: string
166 @param instance_name: the name of the instance to look up
167
168 """
169 idata = self._data["instance"]
170 return idata.get(instance_name, {}).get(KEY_RESTART_COUNT, 0)
171
173 """Returns number of previous cleanup attempts.
174
175 @type instance_name: string
176 @param instance_name: the name of the instance to look up
177
178 """
179 idata = self._data["instance"]
180 return idata.get(instance_name, {}).get(KEY_CLEANUP_COUNT, 0)
181
def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Two kinds of records are dropped: records of instances that are not in
    the given list, and records whose most recent attempt (restart or
    cleanup) is older than RETRY_EXPIRATION seconds.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, forget instances that no longer exist
    for inst in set(idict).difference(instances):
        logging.debug("Forgetting obsolete instance %s", inst)
        idict.pop(inst, None)

    # Second, expire old records. Both timestamps are considered: a record
    # holding only a cleanup attempt has no KEY_RESTART_WHEN and would
    # otherwise always look expired, losing its cleanup counter on every
    # maintenance pass.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [
        name for (name, record) in idict.items()
        if max(record.get(KEY_RESTART_WHEN, 0),
               record.get(KEY_CLEANUP_WHEN, 0)) < earliest
    ]
    for inst in expired_instances:
        logging.debug("Expiring record for instance %s", inst)
        idict.pop(inst, None)
204
@staticmethod
def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    The record for the instance is created if missing; the current time is
    stored under key_when and the counter under key_count is incremented
    (starting from 0).

    @type instances: dict
    @param instances: contains instance data indexed by instance_name

    @type instance_name: string
    @param instance_name: name of the instance involved in the event

    @type key_when: string
    @param key_when: dict key for the information for when the event occurred

    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1
226
228 """Record a restart attempt.
229
230 @type instance_name: string
231 @param instance_name: the name of the instance being restarted
232
233 """
234 self._RecordAttempt(self._data["instance"], instance_name,
235 KEY_RESTART_WHEN, KEY_RESTART_COUNT)
236
238 """Record a cleanup attempt.
239
240 @type instance_name: string
241 @param instance_name: the name of the instance being cleaned up
242
243 """
244 self._RecordAttempt(self._data["instance"], instance_name,
245 KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)
246
248 """Update state to reflect that a machine is running.
249
250 This method removes the record for a named instance (as we only
251 track down instances).
252
253 @type instance_name: string
254 @param instance_name: the name of the instance to remove from books
255
256 """
257 idata = self._data["instance"]
258
259 idata.pop(instance_name, None)
260