blob: caf14a20b4469b830fff2f683d5962d663cf4815 [file] [log] [blame]
mbligh67647152008-11-19 00:18:14 +00001# Copyright Martin J. Bligh, Google Inc 2008
2# Released under the GPL v2
3
"""
This class allows you to communicate with the frontend to submit jobs etc.
It is designed for writing more sophisticated server-side control files that
can recursively add and manage other jobs.

We turn the JSON dictionaries into real objects that are more idiomatic.

For docs, see:
    http://autotest/afe/server/noauth/rpc/
    http://autotest/new_tko/server/noauth/rpc/
    http://docs.djangoproject.com/en/dev/ref/models/querysets/#queryset-api
"""
16
mblighb64d1762009-05-12 20:52:37 +000017import os, time, traceback, re
mbligh67647152008-11-19 00:18:14 +000018import common
19from autotest_lib.frontend.afe import rpc_client_lib
mbligh37eceaa2008-12-15 22:56:37 +000020from autotest_lib.client.common_lib import global_config
mbligh67647152008-11-19 00:18:14 +000021from autotest_lib.client.common_lib import utils
# Prefer the site-specific utilities when the site package is present and
# fall back to the stock server utilities otherwise.  Only a failed import
# triggers the fallback; anything else (KeyboardInterrupt, a genuine bug in
# site_utils, ...) is allowed to propagate instead of being masked.
try:
    from autotest_lib.server.site_common import site_utils as server_utils
except ImportError:
    from autotest_lib.server import utils as server_utils
# Re-exported so that users of this module can call it directly.
form_ntuples_from_machines = server_utils.form_ntuples_from_machines

GLOBAL_CONFIG = global_config.global_config
# Frontend hostname used when neither the caller, the AUTOTEST_WEB
# environment variable nor the global config provide one.
DEFAULT_SERVER = 'autotest'
30
def dump_object(header, obj):
    """
    Render a frontend object (eg job, host, acl, label) as readable text
    for debugging.

    header: first line of the output
    obj: object whose 'hash' dictionary holds the fields to print

    Returns the header followed by one right-aligned 'key: value' line per
    entry of obj.hash; the bookkeeping keys 'afe' and 'hash' are left out.
    """
    lines = [header + '\n']
    for field in obj.hash:
        if field in ('afe', 'hash'):
            continue
        lines.append('%20s: %s\n' % (field, obj.hash[field]))
    return ''.join(lines)
42
43
class RpcClient(object):
    """
    Common base for the RPC clients that talk to the autotest frontend;
    both the TKO and the AFE client derive from it.

    Constructors for the frontend objects live in the afe / tko classes,
    methods that manipulate an object live in the object classes.
    """
    def __init__(self, path, user, server, print_log, debug, reply_debug):
        """
        Set up a proxy for talking to the frontend

        path: path of the RPC endpoint, appended to 'http://' + server
        user: username to connect as (defaults to $LOGNAME)
        server: frontend server to connect to (defaults to $AUTOTEST_WEB,
                then to the SERVER/hostname global config value)
        print_log: print a logging message to stdout on every operation
        debug: print out all RPC traffic
        reply_debug: print out the result of every RPC call
        """
        user = user or os.environ.get('LOGNAME')
        if not server:
            try:
                server = os.environ['AUTOTEST_WEB']
            except KeyError:
                server = GLOBAL_CONFIG.get_config_value(
                        'SERVER', 'hostname', default=DEFAULT_SERVER)
        self.server = server
        self.user = user
        self.print_log = print_log
        self.debug = debug
        self.reply_debug = reply_debug
        headers = {'AUTHORIZATION' : self.user}
        rpc_server = 'http://' + server + path
        if debug:
            print('SERVER: %s' % rpc_server)
            print('HEADERS: %s' % headers)
        self.proxy = rpc_client_lib.get_proxy(rpc_server, headers=headers)


    def run(self, call, **dargs):
        """
        Make a RPC call to the server

        call: name of the RPC method to invoke on the proxy
        dargs: keyword arguments handed to the RPC method

        Returns the reply after passing it through utils.strip_unicode.
        Any exception is reported on stdout and then re-raised.
        """
        rpc_call = getattr(self.proxy, call)
        if self.debug:
            print('DEBUG: %s %s' % (call, dargs))
        try:
            reply = utils.strip_unicode(rpc_call(**dargs))
            if self.reply_debug:
                print(reply)
            return reply
        except Exception:
            print('FAILED RPC CALL: %s %s' % (call, dargs))
            raise


    def log(self, message):
        """
        Print message to stdout, but only if print_log was requested
        """
        if not self.print_log:
            return
        print(message)
102
103
class TKO(RpcClient):
    """
    RPC client for the TKO (results) frontend, served under
    /new_tko/server/noauth/rpc/
    """
    def __init__(self, user=None, server=None, print_log=True, debug=False,
                 reply_debug=False):
        """
        All arguments are handed unchanged to RpcClient.__init__
        """
        super(TKO, self).__init__(path='/new_tko/server/noauth/rpc/',
                                  user=user,
                                  server=server,
                                  print_log=print_log,
                                  debug=debug,
                                  reply_debug=reply_debug)


    def get_status_counts(self, job, **data):
        """
        Fetch the status counts of a job, grouped by hostname, test name
        and reason.

        job: job id; results whose job tag starts with '<job>-' are matched
        data: extra filters handed to the get_status_counts RPC

        Returns a list of TestStatus objects, one per returned group.
        """
        reply = self.run('get_status_counts',
                         group_by=['hostname', 'test_name', 'reason'],
                         job_tag__startswith='%s-' % job, **data)
        statuses = []
        for group in reply['groups']:
            statuses.append(TestStatus(self, group))
        return statuses
mblighc31e4022008-12-11 19:32:30 +0000120
121
class AFE(RpcClient):
    """
    RPC client for the AFE frontend, served under /afe/server/noauth/rpc/

    The get_* / create_* methods wrap the raw RPC replies in the object
    classes of this module (Host, Label, Acl, User, Job, JobStatus,
    ControlFile).  The remaining methods launch test jobs and poll for
    their results.
    """
    def __init__(self, user=None, server=None, print_log=True, debug=False,
                 reply_debug=False, job=None):
        """
        user, server, print_log, debug, reply_debug: see RpcClient
        job: the job we are running inside of (if any); platform results
                are recorded against it by set_platform_results()
        """
        self.job = job
        super(AFE, self).__init__(path='/afe/server/noauth/rpc/',
                                  user=user,
                                  server=server,
                                  print_log=print_log,
                                  debug=debug,
                                  reply_debug=reply_debug)


    def host_statuses(self, live=None):
        """
        Return a list of host status names.

        live: True => the statuses known to the server minus the dead ones
              False => only the dead statuses
              otherwise (default) => every status known to the server
        """
        dead_statuses = ['Dead', 'Repair Failed', 'Repairing']
        statuses = self.run('get_static_data')['host_statuses']
        if live == True:
            return list(set(statuses) - set(dead_statuses))
        if live == False:
            return dead_statuses
        return statuses


    def get_hosts(self, **dargs):
        """
        Return the Host objects matching the given get_hosts RPC filters
        """
        hosts = self.run('get_hosts', **dargs)
        return [Host(self, h) for h in hosts]


    def create_host(self, hostname, **dargs):
        """
        Add a host through the add_host RPC and return it as a Host object
        """
        host_id = self.run('add_host', hostname=hostname, **dargs)
        return self.get_hosts(id=host_id)[0]


    def get_labels(self, **dargs):
        """
        Return the Label objects matching the given get_labels RPC filters
        """
        labels = self.run('get_labels', **dargs)
        return [Label(self, l) for l in labels]


    def create_label(self, name, **dargs):
        """
        Add a label through the add_label RPC and return it as a Label
        """
        label_id = self.run('add_label', name=name, **dargs)
        return self.get_labels(id=label_id)[0]


    def get_acls(self, **dargs):
        """
        Return the Acl objects matching the given get_acl_groups filters
        """
        acls = self.run('get_acl_groups', **dargs)
        return [Acl(self, a) for a in acls]


    def create_acl(self, name, **dargs):
        """
        Add an ACL through the add_acl_group RPC and return it as an Acl
        """
        acl_id = self.run('add_acl_group', name=name, **dargs)
        return self.get_acls(id=acl_id)[0]


    def get_users(self, **dargs):
        """
        Return the User objects matching the given get_users RPC filters
        """
        users = self.run('get_users', **dargs)
        return [User(self, u) for u in users]


    def generate_control_file(self, tests, **dargs):
        """
        Ask the server to generate a control file for the given tests and
        return the reply as a ControlFile object
        """
        ret = self.run('generate_control_file', tests=tests, **dargs)
        return ControlFile(self, ret)


    def get_jobs(self, summary=False, **dargs):
        """
        Return the Job objects matching the given filters.

        summary: use the get_jobs_summary RPC instead of get_jobs
        dargs: filters handed to the RPC

        Every returned job gets the extra attributes testname,
        platform_results and platform_reasons set to their defaults.
        """
        if summary:
            jobs_data = self.run('get_jobs_summary', **dargs)
        else:
            jobs_data = self.run('get_jobs', **dargs)
        jobs = []
        for j in jobs_data:
            job = Job(self, j)
            # Set up some extra information defaults
            # arbitrary default: the job name up to the first whitespace
            job.testname = re.sub(r'\s.*', '', job.name)
            job.platform_results = {}
            job.platform_reasons = {}
            jobs.append(job)
        return jobs


    def get_host_queue_entries(self, **data):
        """
        Return the JobStatus objects matching the given filters, limited to
        those entries that have either a host or a meta_host.
        """
        entries = self.run('get_host_queue_entries', **data)
        job_statuses = [JobStatus(self, e) for e in entries]

        # Sadly, get_host_queue_entries doesn't return platforms, we have
        # to get those back from an explicit get_hosts query, then patch
        # the new host objects back into the host list.
        hostnames = [s.host.hostname for s in job_statuses if s.host]
        host_hash = {}
        for host in self.get_hosts(hostname__in=hostnames):
            host_hash[host.hostname] = host
        for status in job_statuses:
            if status.host:
                status.host = host_hash[status.host.hostname]
        # filter job statuses that have either host or meta_host
        return [status for status in job_statuses if (status.host or
                                                      status.meta_host)]


    def create_job_by_test(self, tests, kernel=None, use_container=False,
                           **dargs):
        """
        Given a test name, fetch the appropriate control file from the server
        and submit it.

        tests: tests to generate the control file for
        kernel, use_container: handed to generate_control_file
        dargs: handed to create_job; must contain either 'hosts' or both
                'atomic_group_name' and 'synch_count'

        Returns the new job object, or None if fewer hosts were given than
        the synch_count requires.
        """
        assert ('hosts' in dargs or
                'atomic_group_name' in dargs and 'synch_count' in dargs)
        control_file = self.generate_control_file(tests=tests, kernel=kernel,
                                                  use_container=use_container,
                                                  do_push_packages=True)
        if control_file.is_server:
            dargs['control_type'] = 'Server'
        else:
            dargs['control_type'] = 'Client'
        dargs['dependencies'] = dargs.get('dependencies', []) + \
                                control_file.dependencies
        dargs['control_file'] = control_file.control_file
        dargs.setdefault('synch_count', control_file.synch_count)
        if 'hosts' in dargs and len(dargs['hosts']) < dargs['synch_count']:
            # will not be able to satisfy this request
            return None
        return self.create_job(**dargs)


    def create_job(self, control_file, name=' ', priority='Medium',
                   control_type='Client', **dargs):
        """
        Submit a job through the create_job RPC and return it as a Job
        """
        job_id = self.run('create_job', name=name, priority=priority,
                 control_file=control_file, control_type=control_type, **dargs)
        return self.get_jobs(id=job_id)[0]


    def run_test_suites(self, pairings, kernel, kernel_label, priority='Medium',
                        wait=True, poll_interval=10, email_from=None,
                        email_to=None, timeout=168):
        """
        Run a list of test suites on a particular kernel.

        Poll for them to complete, and return whether they worked or not.

        pairings: list of MachineTestPairing objects to invoke
        kernel: name of the kernel to run
        kernel_label: label of the kernel to run
                                        (<kernel-version> : <config> : <date>)
        priority: priority to submit the jobs with
        wait: boolean - wait for the results to come back?
        poll_interval: interval between polling for job results (in minutes)
        email_from: send notification email upon completion from here
        email_to: send notification email upon completion to here
        timeout: handed to the job creation as its 'timeout' argument

        Returns None when not waiting or when no job could be started,
        otherwise the first non-None result of poll_all_jobs.
        """
        jobs = []
        for pairing in pairings:
            # Best effort: a pairing that fails to launch is reported and
            # skipped so that the remaining pairings still get submitted.
            try:
                new_job = self.invoke_test(pairing, kernel, kernel_label,
                                           priority, timeout=timeout)
                if not new_job:
                    continue
                jobs.append(new_job)
            except Exception:
                traceback.print_exc()
        if not wait or not jobs:
            return
        tko = TKO()
        while True:
            time.sleep(60 * poll_interval)
            result = self.poll_all_jobs(tko, jobs, email_from, email_to)
            if result is not None:
                return result


    def result_notify(self, job, email_from, email_to):
        """
        Notify about the result of a job. Will always print, if email data
        is provided, will send email for it as well.

        job: job object to notify about
        email_from: send notification email upon completion from here
        email_to: send notification email upon completion to here
        """
        if job.result == True:
            subject = 'Testing PASSED: '
        else:
            subject = 'Testing FAILED: '
        subject += '%s : %s\n' % (job.name, job.id)
        text = []
        for platform in job.results_platform_map:
            for status in job.results_platform_map[platform]:
                if status == 'Total':
                    continue
                for host in job.results_platform_map[platform][status]:
                    text.append('%20s %10s %10s' % (platform, status, host))
                    if status == 'Failed':
                        # list every failed test of this host
                        for test_status in job.test_status[host].fail:
                            text.append('(%s, %s) : %s' % \
                                        (host, test_status.test_name,
                                         test_status.reason))
                        text.append('')

        base_url = 'http://' + self.server

        params = ('columns=test',
                  'rows=machine_group',
                  "condition=tag~'%s-%%25'" % job.id,
                  'title=Report')
        query_string = '&'.join(params)
        url = '%s/tko/compose_query.cgi?%s' % (base_url, query_string)
        text.append(url + '\n')
        url = '%s/afe/#tab_id=view_job&object_id=%s' % (base_url, job.id)
        text.append(url + '\n')

        body = '\n'.join(text)
        print('---------------------------------------------------')
        # two spaces: one from the label, one item separator, exactly what
        # the two-argument print statement used to emit
        print('Subject:  %s' % subject)
        print(body)
        print('---------------------------------------------------')
        if email_from and email_to:
            print('Sending email ...')
            utils.send_email(email_from, email_to, subject, body)
        print('')


    def print_job_result(self, job):
        """
        Print the result of a single job.
        job: a job object
        """
        if job.result is None:
            state = 'PENDING'
        elif job.result == True:
            state = 'PASSED'
        elif job.result == False:
            state = 'FAILED'
        elif job.result == "Abort":
            state = 'ABORT'
        else:
            state = None
        line = ' %s : %s' % (job.id, job.name)
        if state is not None:
            # state and details are separated by a single space
            line = state + ' ' + line
        print(line)


    def poll_all_jobs(self, tko, jobs, email_from=None, email_to=None):
        """
        Poll all jobs in a list.
            tko: TKO client used to fetch the test results
            jobs: list of job objects to poll
            email_from: send notification email upon completion from here
            email_to: send notification email upon completion to here

            Returns:
                a) All complete successfully (return True)
                b) One or more has failed or aborted (return False)
                c) Cannot tell yet (return None)
        """
        results = []
        for job in jobs:
            # only poll jobs whose result is still undecided
            if getattr(job, 'result', None) is None:
                job.result = self.poll_job_results(tko, job)
                if job.result is not None:
                    self.result_notify(job, email_from, email_to)

            results.append(job.result)
            self.print_job_result(job)

        if None in results:
            return None
        elif False in results or "Abort" in results:
            return False
        else:
            return True


    def _included_platform(self, host, platforms):
        """
        See if host's platforms matches any of the patterns in the included
        platforms list.
        """
        if not platforms:
            return True        # No filtering of platforms
        for platform in platforms:
            if re.search(platform, host.platform):
                return True
        return False


    def invoke_test(self, pairing, kernel, kernel_label, priority='Medium',
                    **dargs):
        """
        Given a pairing of a control file to a machine label, find all machines
        with that label, and submit that control file to them.

        Returns the new job object, or None if the job could not be created
        """
        job_name = '%s : %s' % (pairing.machine_label, kernel_label)
        hosts = self.get_hosts(multiple_labels=[pairing.machine_label])
        platforms = pairing.platforms
        hosts = [h for h in hosts if self._included_platform(h, platforms)]
        dead_statuses = self.host_statuses(live=False)
        host_list = [h.hostname for h in hosts if h.status not in dead_statuses]
        print('HOSTS: %s' % host_list)
        # TODO(ncrao): fix this when synch_count implements "at least N"
        # semantics instead of "exactly N".
        if pairing.atomic_group_sched:
            if pairing.synch_count > 0:
                dargs['synch_count'] = pairing.synch_count
            else:
                dargs['synch_count'] = len(host_list)
            dargs['atomic_group_name'] = pairing.machine_label
        else:
            dargs['hosts'] = host_list
        new_job = self.create_job_by_test(name=job_name,
                                          dependencies=[pairing.machine_label],
                                          tests=[pairing.control_file],
                                          priority=priority,
                                          kernel=kernel,
                                          use_container=pairing.container,
                                          **dargs)
        if new_job:
            if pairing.testname:
                new_job.testname = pairing.testname
            print('Invoked test %s : %s' % (new_job.id, job_name))
        return new_job


    def _job_test_results(self, tko, job, debug, tests=None):
        """
        Retrieve test results for a job and store them in job.test_status,
        a dictionary of TestResults objects indexed by hostname.

        tko: TKO client used to fetch the status counts
        job: job object to fill in
        debug: print every test status that is taken into account
        tests: if given and not empty, only look at these test names

        On an RPC failure job.test_status is left empty.
        """
        job.test_status = {}
        try:
            test_statuses = tko.get_status_counts(job=job.id)
        except Exception:
            print("Ignoring exception on poll job; RPC interface is flaky")
            traceback.print_exc()
            return

        for test_status in test_statuses:
            # SERVER_JOB is buggy, and often gives false failures. Ignore it.
            if test_status.test_name == 'SERVER_JOB':
                continue
            # if tests is not empty, restrict list of test_statuses to tests
            if tests and test_status.test_name not in tests:
                continue
            if debug:
                print(test_status)
            hostname = test_status.hostname
            if hostname not in job.test_status:
                job.test_status[hostname] = TestResults()
            job.test_status[hostname].add(test_status)


    def _job_results_platform_map(self, job, debug):
        """
        Figure out which hosts passed / failed / aborted in a job.

        Creates a 2-dimensional hash, stored as job.results_platform_map
            1st index - platform type (string)
            2nd index - Status (string)
                'Completed' / 'Failed' / 'Aborted' (plus 'Total')
            Data indexed by this hash is a list of hostnames (text strings)

        Also fills in job.job_status and job.metahost_index.  On an RPC
        failure job.results_platform_map is left empty.
        """
        job.results_platform_map = {}
        try:
            job_statuses = self.get_host_queue_entries(job=job.id)
        except Exception:
            print("Ignoring exception on poll job; RPC interface is flaky")
            traceback.print_exc()
            return None

        platform_map = {}
        job.job_status = {}
        job.metahost_index = {}
        for job_status in job_statuses:
            # This is basically "for each host / metahost in the job"
            if job_status.host:
                hostname = job_status.host.hostname
            else:              # This is a metahost
                # number the entries of each metahost: <metahost>.1, .2, ...
                metahost = job_status.meta_host
                index = job.metahost_index.get(metahost, 1)
                job.metahost_index[metahost] = index + 1
                hostname = '%s.%s' % (metahost, index)
            job.job_status[hostname] = job_status.status
            status = job_status.status
            # Skip hosts that failed verify or repair:
            # that's a machine failure, not a job failure
            if hostname in job.test_status:
                verify_failed = False
                for failure in job.test_status[hostname].fail:
                    if (failure.test_name == 'verify' or
                        failure.test_name == 'repair'):
                        verify_failed = True
                        break
                if verify_failed:
                    continue
            if hostname in job.test_status and job.test_status[hostname].fail:
                # If the any tests failed in the job, we want to mark the
                # job result as failed, overriding the default job status.
                if status != "Aborted":         # except if it's an aborted job
                    status = 'Failed'
            if job_status.host:
                platform = job_status.host.platform
            else:              # This is a metahost
                platform = job_status.meta_host
            if platform not in platform_map:
                platform_map[platform] = {'Total' : [hostname]}
            else:
                platform_map[platform]['Total'].append(hostname)
            new_host_list = platform_map[platform].get(status, []) + [hostname]
            platform_map[platform][status] = new_host_list
        job.results_platform_map = platform_map


    def set_platform_results(self, test_job, platform, result):
        """
        Record the final result of test_job on one platform.

        Result must be None, 'FAIL', 'WARN' or 'GOOD'
        """
        if test_job.platform_results[platform] is not None:
            # We're already done, and results recorded. This can't change later.
            return
        test_job.platform_results[platform] = result
        # Note that self.job refers to the metajob we're IN, not the job
        # that we're executing from here.
        testname = '%s.%s' % (test_job.testname, platform)
        if self.job:
            self.job.record(result, None, testname, status='')


    def poll_job_results(self, tko, job, debug=False):
        """
        Analyse all job results by platform, return:

            "Abort": if any platform has more than one aborted machine
            False: if any platform has more than one failure, or at least
                   half of its machines failed
            None: if there are no test results yet, or any platform has
                  more than one machine not yet Good.
            True: if all platforms have at least all-but-one machines Good.
        """
        self._job_test_results(tko, job, debug)
        if job.test_status == {}:
            return None
        self._job_results_platform_map(job, debug)

        good_platforms = []
        failed_platforms = []
        aborted_platforms = []
        unknown_platforms = []
        platform_map = job.results_platform_map
        for platform in platform_map:
            if platform not in job.platform_results:
                # record test start, but there's no way to do this right now
                job.platform_results[platform] = None
            total = len(platform_map[platform]['Total'])
            completed = len(platform_map[platform].get('Completed', []))
            failed = len(platform_map[platform].get('Failed', []))
            aborted = len(platform_map[platform].get('Aborted', []))

            # We set up what we want to record here, but don't actually do
            # it yet, until we have a decisive answer for this platform
            if aborted or failed:
                bad = aborted + failed
                if (bad > 1) or (bad * 2 >= total):
                    platform_test_result = 'FAIL'
                else:
                    platform_test_result = 'WARN'

            if aborted > 1:
                aborted_platforms.append(platform)
                self.set_platform_results(job, platform, platform_test_result)
            elif (failed * 2 >= total) or (failed > 1):
                failed_platforms.append(platform)
                self.set_platform_results(job, platform, platform_test_result)
            elif (completed >= 1) and (completed + 1 >= total):
                # if all or all but one are good, call the job good.
                good_platforms.append(platform)
                self.set_platform_results(job, platform, 'GOOD')
            else:
                unknown_platforms.append(platform)
            detail = []
            for status in platform_map[platform]:
                if status == 'Total':
                    continue
                detail.append('%s=%s' % (status,platform_map[platform][status]))
            if debug:
                print('%20s %d/%d %s' % (platform, completed, total,
                                         ' '.join(detail)))
                print('')

        if len(aborted_platforms) > 0:
            if debug:
                print('Result aborted - platforms:  ' +
                      ' '.join(aborted_platforms))
            return "Abort"
        if len(failed_platforms) > 0:
            if debug:
                print('Result bad - platforms: ' + ' '.join(failed_platforms))
            return False
        if len(unknown_platforms) > 0:
            if debug:
                platform_list = ' '.join(unknown_platforms)
                print('Result unknown - platforms:  ' + platform_list)
            return None
        if debug:
            platform_list = ' '.join(good_platforms)
            print('Result good - all platforms passed:  ' + platform_list)
        return True
616
617
class TestResults(object):
    """
    Holds the test results of a job, sorted into three lists:
    good, fail and pending
    """
    def __init__(self):
        self.good = []
        self.fail = []
        self.pending = []


    def add(self, result):
        """
        File a test status under fail, pending or good.

        A result with more completed than passed runs counts as failed;
        otherwise one with incomplete runs is pending; anything else is good.
        """
        if result.complete_count > result.pass_count:
            bucket = self.fail
        elif result.incomplete_count > 0:
            bucket = self.pending
        else:
            bucket = self.good
        bucket.append(result)
mbligh5280e3b2008-12-22 14:39:28 +0000635
636
class RpcObject(object):
    """
    Base class that turns the dictionary returned by an rpc call into a
    python object: every key of the dictionary becomes an attribute.
    """
    def __init__(self, afe, hash):
        """
        afe: the rpc client this object was fetched with
        hash: dictionary of fields returned by the rpc call
        """
        self.afe = afe
        self.hash = hash
        # applied last, so fields of the reply win over the two names above
        vars(self).update(hash)


    def __str__(self):
        """
        Multi-line dump: the repr() of the object followed by its fields
        """
        header = self.__repr__()
        return dump_object(header, self)
649
650
class ControlFile(RpcObject):
    """
    Control file as generated by the AFE server

    Fields: synch_count, dependencies, control_file, is_server
    """
    def __repr__(self):
        contents = self.control_file
        return 'CONTROL FILE: %s' % contents
659
660
class Label(RpcObject):
    """
    AFE label object

    Fields:
        name, invalid, platform, kernel_config, id, only_if_needed
    """
    def __repr__(self):
        return 'LABEL: %s' % self.name


    def add_hosts(self, hosts):
        """
        Attach this label to the given hosts.

        hosts: list of hosts to add the label to
        """
        # run() only accepts keyword arguments for the RPC parameters;
        # passing them positionally raises a TypeError.
        return self.afe.run('label_add_hosts', id=self.id, hosts=hosts)


    def remove_hosts(self, hosts):
        """
        Detach this label from the given hosts.

        hosts: list of hosts to remove the label from
        """
        return self.afe.run('label_remove_hosts', id=self.id, hosts=hosts)
678
679
class Acl(RpcObject):
    """
    AFE acl object

    Fields:
        users, hosts, description, name, id
    """
    def __repr__(self):
        return 'ACL: %s' % self.name


    def add_hosts(self, hosts):
        """
        Add the given hosts to this ACL (logged through the rpc client).
        """
        self.afe.log('Adding hosts %s to ACL %s' % (hosts, self.name))
        # run() only accepts keyword arguments for the RPC parameters;
        # passing them positionally raises a TypeError.
        return self.afe.run('acl_group_add_hosts', id=self.id, hosts=hosts)


    def remove_hosts(self, hosts):
        """
        Remove the given hosts from this ACL (logged through the rpc client).
        """
        self.afe.log('Removing hosts %s from ACL %s' % (hosts, self.name))
        return self.afe.run('acl_group_remove_hosts', id=self.id, hosts=hosts)


    def add_users(self, users):
        """
        Add the given users to this ACL (logged through the rpc client).
        The ACL is addressed by its name here.
        """
        self.afe.log('Adding users %s to ACL %s' % (users, self.name))
        return self.afe.run('acl_group_add_users', id=self.name, users=users)
704
705
class Job(RpcObject):
    """
    Job as known to the AFE server

    Fields:
        name, control_file, control_type, synch_count, reboot_before,
        run_verify, priority, email_list, created_on, dependencies,
        timeout, owner, reboot_after, id
    """
    def __repr__(self):
        job_id = self.id
        return 'JOB: %s' % job_id
717
718
class JobStatus(RpcObject):
    """
    AFE job_status object

    Fields:
        status, complete, deleted, meta_host, host, active, execution_subdir, id

    The 'job' field is wrapped in a Job object and, when set, the 'host'
    field in a Host object.
    """
    def __init__(self, afe, hash):
        """
        afe: the rpc client this object was fetched with
        hash: dictionary of fields returned by the rpc call
        """
        super(JobStatus, self).__init__(afe, hash)
        # replace the nested dictionaries of the reply by real objects
        self.job = Job(afe, self.job)
        if self.host:
            self.host = Host(afe, self.host)


    def __repr__(self):
        if self.host and self.host.hostname:
            hostname = self.host.hostname
        else:
            hostname = 'None'
        return 'JOB STATUS: %s-%s' % (self.job.id, hostname)
mbligh67647152008-11-19 00:18:14 +0000742
743
class Host(RpcObject):
    """
    Host as known to the AFE server

    Fields:
        status, lock_time, locked_by, locked, hostname, invalid,
        synch_id, labels, platform, protection, dirty, id
    """
    def __repr__(self):
        name = self.hostname
        return 'HOST OBJECT: %s' % name


    def show(self):
        """
        Print a one-line summary: hostname, status, locked, platform and
        the labels other than the platform label
        """
        other_labels = list(set(self.labels) - set([self.platform]))
        columns = (self.hostname, self.status, self.locked, self.platform,
                   ', '.join(other_labels))
        print('%-6s %-7s %-7s %-16s %s' % columns)


    def delete(self):
        """
        Delete this host through the delete_host RPC
        """
        return self.afe.run('delete_host', id=self.id)


    def modify(self, **dargs):
        """
        Change this host through the modify_host RPC

        dargs: the fields to change
        """
        return self.afe.run('modify_host', id=self.id, **dargs)


    def get_acls(self):
        """
        Return the Acl objects this host is part of
        """
        return self.afe.get_acls(hosts__hostname=self.hostname)


    def add_acl(self, acl_name):
        """
        Add this host to the ACL called acl_name
        """
        message = 'Adding ACL %s to host %s' % (acl_name, self.hostname)
        self.afe.log(message)
        return self.afe.run('acl_group_add_hosts', id=acl_name,
                            hosts=[self.hostname])


    def remove_acl(self, acl_name):
        """
        Remove this host from the ACL called acl_name
        """
        message = 'Removing ACL %s from host %s' % (acl_name, self.hostname)
        self.afe.log(message)
        return self.afe.run('acl_group_remove_hosts', id=acl_name,
                            hosts=[self.hostname])


    def get_labels(self):
        """
        Return the Label objects attached to this host
        """
        return self.afe.get_labels(host__hostname__in=[self.hostname])


    def add_labels(self, labels):
        """
        Attach the given labels to this host
        """
        message = 'Adding labels %s to host %s' % (labels, self.hostname)
        self.afe.log(message)
        return self.afe.run('host_add_labels', id=self.id, labels=labels)


    def remove_labels(self, labels):
        """
        Detach the given labels from this host
        """
        message = 'Removing labels %s from host %s' % (labels,self.hostname)
        self.afe.log(message)
        return self.afe.run('host_remove_labels', id=self.id, labels=labels)
mbligh5b618382008-12-03 15:24:01 +0000799
800
class User(RpcObject):
    """
    User as known to the AFE server; identified by its login field
    """
    def __repr__(self):
        login = self.login
        return 'USER: %s' % login
804
805
class TestStatus(RpcObject):
    """
    Test status as returned by the TKO server

    Fields:
        test_idx, hostname, testname, id
        complete_count, incomplete_count, group_count, pass_count
    """
    def __repr__(self):
        status_id = self.id
        return 'TEST STATUS: %s' % status_id
816
817
class MachineTestPairing(object):
    """
    Object representing the pairing of a machine label with a control file

    machine_label: use machines from this label
    control_file: use this control file (by name in the frontend)
    platforms: list of regexps to filter platforms by.
               None or [] => no filtering
    container: run the test inside a container (handed to the job creation
               as use_container)
    atomic_group_sched: schedule on machine_label as an atomic group
               instead of on an explicit list of hosts
    synch_count: synch_count to use with atomic_group_sched; if not > 0,
               the number of usable hosts is used instead
    testname: if set, overrides the testname of the job that gets created
    """
    def __init__(self, machine_label, control_file, platforms=None,
                 container=False, atomic_group_sched=False, synch_count=0,
                 testname=None):
        self.machine_label = machine_label
        self.control_file = control_file
        # Build a fresh list per instance: a literal [] default would be
        # one list shared (and mutated) across every pairing.
        if platforms is None:
            platforms = []
        self.platforms = platforms
        self.container = container
        self.atomic_group_sched = atomic_group_sched
        self.synch_count = synch_count
        self.testname = testname


    def __repr__(self):
        return '%s %s %s %s' % (self.machine_label, self.control_file,
                                self.platforms, self.container)