blob: 3bb64f26d52a61a779d4d5873d73880db0818362 [file] [log] [blame]
Hung-Te Linf2f78f72012-02-08 19:27:11 +08001#!/usr/bin/python -u
2#
3# -*- coding: utf-8 -*-
4#
5# Copyright (c) 2010 The Chromium OS Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9'''
10The main factory flow that runs the factory test and finalizes a device.
11'''
12
13import logging, os, pickle, pipes, signal, subprocess, sys, tempfile
14import threading, time, traceback
15from collections import deque
16from optparse import OptionParser
17from Queue import Queue
18
19import factory_common
20from autotest_lib.client.bin import prespawner
21from autotest_lib.client.cros import factory
22from autotest_lib.client.cros.factory import state
23from autotest_lib.client.cros.factory import TestState
24from autotest_lib.client.cros.factory.event import Event
25from autotest_lib.client.cros.factory.event import EventClient
26from autotest_lib.client.cros.factory.event import EventServer
27
28
29SCRIPT_PATH = os.path.realpath(__file__)
30CROS_FACTORY_LIB_PATH = os.path.dirname(SCRIPT_PATH)
Hung-Te Lin6bb48552012-02-09 14:37:43 +080031FACTORY_UI_PATH = os.path.join(CROS_FACTORY_LIB_PATH, 'ui')
Hung-Te Linf2f78f72012-02-08 19:27:11 +080032CLIENT_PATH = os.path.realpath(os.path.join(CROS_FACTORY_LIB_PATH, '..', '..'))
33DEFAULT_TEST_LIST_PATH = os.path.join(
34 CLIENT_PATH , 'site_tests', 'suite_Factory', 'test_list')
35HWID_CFG_PATH = '/usr/local/share/chromeos-hwid/cfg'
36
37
38def get_hwid_cfg():
39 '''
40 Returns the HWID config tag, or an empty string if none can be found.
41 '''
42 if 'CROS_HWID' in os.environ:
43 return os.environ['CROS_HWID']
44 if os.path.exists(HWID_CFG_PATH):
45 with open(HWID_CFG_PATH, 'rt') as hwid_cfg_handle:
46 return hwid_cfg_handle.read().strip()
47 return ''
48
49
50def find_test_list():
51 '''
52 Returns the path to the active test list, based on the HWID config tag.
53 '''
54 hwid_cfg = get_hwid_cfg()
55
56 # Try in order: test_list, test_list.$hwid_cfg, test_list.all
57 if hwid_cfg:
58 test_list = '%s_%s' % (DEFAULT_TEST_LIST_PATH, hwid_cfg)
59 if os.path.exists(test_list):
60 logging.info('Using special test list: %s', test_list)
61 return test_list
62 logging.info('WARNING: no specific test list for config: %s', hwid_cfg)
63
64 test_list = DEFAULT_TEST_LIST_PATH
65 if os.path.exists(test_list):
66 return test_list
67
68 test_list = ('%s.all' % DEFAULT_TEST_LIST_PATH)
69 if os.path.exists(test_list):
70 logging.info('Using default test list: ' + test_list)
71 return test_list
72 logging.info('ERROR: Cannot find any test list.')
73
74
75def is_process_alive(pid):
76 '''
77 Returns true if the named process is alive and not a zombie.
78 '''
79 try:
80 with open("/proc/%d/stat" % pid) as f:
81 return f.readline().split()[2] != 'Z'
82 except IOError:
83 return False
84
85
86def kill_process_tree(process, caption):
87 '''
88 Kills a process and all its subprocesses.
89
90 @param process: The process to kill (opened with the subprocess module).
91 @param caption: A caption describing the process.
92 '''
Hung-Te Lin1b094702012-03-15 18:14:15 +080093 # os.kill does not kill child processes. os.killpg kills all processes
94 # sharing same group (and is usually used for killing process tree). But in
95 # our case, to preserve PGID for autotest and upstart service, we need to
96 # iterate through each level until leaf of the tree.
97
98 def get_all_pids(root):
99 ps_output = subprocess.Popen(['ps','--no-headers','-eo','pid,ppid'],
100 stdout=subprocess.PIPE)
101 children = {}
102 for line in ps_output.stdout:
103 match = re.findall('\d+', line)
104 children.setdefault(int(match[1]), []).append(int(match[0]))
105 pids = []
106 def add_children(pid):
107 pids.append(pid)
108 map(add_children, children.get(pid, []))
109 add_children(root)
110 # Kill children first then parents.
111 return reversed(pids)
112
113 pids = get_all_pids([process.pid])
114
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800115 for sig in [signal.SIGTERM, signal.SIGKILL]:
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800116 logging.info('Stopping %s (pid=%s)...', caption, sorted(pids))
117
118 for i in range(25): # Try 25 times (200 ms between tries)
119 for pid in pids:
120 try:
Jon Salza751ce52012-03-15 14:59:33 +0800121 logging.info("Sending signal %s to %d" % (sig, pid))
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800122 os.kill(pid, sig)
123 except OSError:
124 pass
Jon Salza751ce52012-03-15 14:59:33 +0800125 pids = filter(is_process_alive, pids)
126 if not pids:
127 return
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800128 time.sleep(0.2) # Sleep 200 ms and try again
129
130 logging.warn('Failed to stop %s process. Ignoring.', caption)
131
132
133class TestInvocation(object):
134 '''
135 State for an active test.
136 '''
137 def __init__(self, goofy, test, on_completion=None):
138 '''Constructor.
139
140 @param goofy: The controlling Goofy object.
141 @param test: The FactoryTest object to test.
142 @param on_completion: Callback to invoke in the goofy event queue
143 on completion.
144 '''
145 self.goofy = goofy
146 self.test = test
147 self.thread = threading.Thread(target=self._run)
148 self.on_completion = on_completion
149
150 self._lock = threading.Lock()
151 # The following properties are guarded by the lock.
152 self._aborted = False
153 self._completed = False
154 self._process = None
155
156 def start(self):
157 '''Starts the test thread.'''
158 self.thread.start()
159
160 def abort_and_join(self):
161 '''
162 Aborts a test (must be called from the event controller thread).
163 '''
164 with self._lock:
165 self._aborted = True
166 if self._process:
167 kill_process_tree(self._process, 'autotest')
168 if self.thread:
169 self.thread.join()
170 with self._lock:
171 # Should be set by the thread itself, but just in case...
172 self._completed = True
173
174 def is_completed(self):
175 '''
176 Returns true if the test has finished.
177 '''
178 with self._lock:
179 return self._completed
180
181 def _invoke_autotest(self, test, dargs):
182 '''
183 Invokes an autotest test.
184
185 This method encapsulates all the magic necessary to run a single
186 autotest test using the 'autotest' command-line tool and get a
187 sane pass/fail status and error message out. It may be better
188 to just write our own command-line wrapper for job.run_test
189 instead.
190
191 @param test: the autotest to run
192 @param dargs: the argument map
193 @return: tuple of status (TestState.PASSED or TestState.FAILED) and
194 error message, if any
195 '''
196 status = TestState.FAILED
197 error_msg = 'Unknown'
198
199 try:
200 client_dir = CLIENT_PATH
201
202 output_dir = '%s/results/%s' % (client_dir, test.path)
203 if not os.path.exists(output_dir):
204 os.makedirs(output_dir)
205 tmp_dir = tempfile.mkdtemp(prefix='tmp', dir=output_dir)
206
207 control_file = os.path.join(tmp_dir, 'control')
208 result_file = os.path.join(tmp_dir, 'result')
209 args_file = os.path.join(tmp_dir, 'args')
210
211 with open(args_file, 'w') as f:
212 pickle.dump(dargs, f)
213
214 # Create a new control file to use to run the test
215 with open(control_file, 'w') as f:
216 print >> f, 'import common, traceback, utils'
217 print >> f, 'import cPickle as pickle'
218 print >> f, ("success = job.run_test("
219 "'%s', **pickle.load(open('%s')))" % (
220 test.autotest_name, args_file))
221
222 print >> f, (
223 "pickle.dump((success, "
224 "str(job.last_error) if job.last_error else None), "
225 "open('%s', 'w'), protocol=2)"
226 % result_file)
227
228 args = ['%s/bin/autotest' % client_dir,
229 '--output_dir', output_dir,
230 control_file]
231
232 factory.console.info('Running test %s' % test.path)
233 logging.debug('Test command line: %s', ' '.join(
234 [pipes.quote(arg) for arg in args]))
235
236 with self._lock:
237 self._process = prespawner.spawn(
238 args, {'CROS_FACTORY_TEST_PATH': test.path})
239
240 returncode = self._process.wait()
241 with self._lock:
242 if self._aborted:
243 error_msg = 'Aborted by operator'
244 return
245
246 if returncode:
247 # Only happens when there is an autotest-level problem (not when
248 # the test actually failed).
249 error_msg = 'autotest returned with code %d' % returncode
250 return
251
252 with open(result_file) as f:
253 try:
254 success, error_msg = pickle.load(f)
255 except: # pylint: disable=W0702
256 logging.exception('Unable to retrieve autotest results')
257 error_msg = 'Unable to retrieve autotest results'
258 return
259
260 if success:
261 status = TestState.PASSED
262 error_msg = ''
263 except Exception: # pylint: disable=W0703
264 traceback.print_exc(sys.stderr)
265 finally:
266 factory.console.info('Test %s: %s' % (test.path, status))
267 return status, error_msg # pylint: disable=W0150
268
269 def _run(self):
270 with self._lock:
271 if self._aborted:
272 return
273
274 count = self.test.update_state(
275 status=TestState.ACTIVE, increment_count=1, error_msg='').count
276
277 test_tag = '%s_%s' % (self.test.path, count)
278 dargs = dict(self.test.dargs)
279 dargs.update({'tag': test_tag,
280 'test_list_path': self.goofy.test_list_path})
281
282 status, error_msg = self._invoke_autotest(self.test, dargs)
283 self.test.update_state(status=status, error_msg=error_msg,
284 visible=False)
285 with self._lock:
286 self._completed = True
287 self.goofy.run_queue.put(self.goofy.reap_completed_tests)
288 self.goofy.run_queue.put(self.on_completion)
289
290
291class Goofy(object):
292 '''
293 The main factory flow.
294
295 Note that all methods in this class must be invoked from the main
296 (event) thread. Other threads, such as callbacks and TestInvocation
297 methods, should instead post events on the run queue.
298
299 TODO: Unit tests. (chrome-os-partner:7409)
300
301 Properties:
302 state_instance: An instance of FactoryState.
303 state_server: The FactoryState XML/RPC server.
304 state_server_thread: A thread running state_server.
305 event_server: The EventServer socket server.
306 event_server_thread: A thread running event_server.
307 event_client: A client to the event server.
Hung-Te Lin6bb48552012-02-09 14:37:43 +0800308 ui_process: The factory ui process object.
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800309 run_queue: A queue of callbacks to invoke from the main thread.
310 invocations: A map from FactoryTest objects to the corresponding
311 TestInvocations objects representing active tests.
312 tests_to_run: A deque of tests that should be run when the current
313 test(s) complete.
314 options: Command-line options.
315 args: Command-line args.
316 test_list_path: The path to the test list.
317 test_list: The test list.
318 '''
319 def __init__(self):
320 self.state_instance = None
321 self.state_server = None
322 self.state_server_thread = None
323 self.event_server = None
324 self.event_server_thread = None
325 self.event_client = None
326 self.ui_process = None
327 self.run_queue = Queue()
328 self.invocations = {}
329 self.tests_to_run = deque()
330 self.visible_test = None
331
332 self.options = None
333 self.args = None
334 self.test_list_path = None
335 self.test_list = None
336
337 def __del__(self):
338 if self.ui_process:
339 kill_process_tree(self.ui_process, 'ui')
340 if self.state_server_thread:
341 logging.info('Stopping state server')
342 self.state_server.shutdown()
343 self.state_server_thread.join()
344 if self.event_server_thread:
345 logging.info('Stopping event server')
346 self.event_server.shutdown() # pylint: disable=E1101
347 self.event_server_thread.join()
348 prespawner.stop()
349
350 def start_state_server(self):
351 self.state_instance, self.state_server = state.create_server()
352 logging.info('Starting state server')
353 self.state_server_thread = threading.Thread(
354 target=self.state_server.serve_forever)
355 self.state_server_thread.start()
356
357 def start_event_server(self):
358 self.event_server = EventServer()
359 logging.info('Starting factory event server')
360 self.event_server_thread = threading.Thread(
361 target=self.event_server.serve_forever) # pylint: disable=E1101
362 self.event_server_thread.start()
363
364 self.event_client = EventClient(
365 callback=self.handle_event, event_loop=self.run_queue)
366
367 def start_ui(self):
368 ui_proc_args = [FACTORY_UI_PATH, self.test_list_path]
369 logging.info('Starting ui %s', ui_proc_args)
370 self.ui_process = subprocess.Popen(ui_proc_args)
371 logging.info('Waiting for UI to come up...')
372 self.event_client.wait(
373 lambda event: event.type == Event.Type.UI_READY)
374 logging.info('UI has started')
375
376 def set_visible_test(self, test):
377 if self.visible_test == test:
378 return
379
380 if test:
381 test.update_state(visible=True)
382 if self.visible_test:
383 self.visible_test.update_state(visible=False)
384 self.visible_test = test
385
386 def handle_reboot_complete(self, test, state):
387 '''
388 Handles the case where a reboot was detected during a reboot step.
389
390 @param test: The RebootStep.
391 @param state: The test state.
392 '''
393 state = test.update_state(increment_reboot_count=1)
394 logging.info('Detected reboot (%d of %d)',
395 state.reboot_count, test.iterations)
396 if state.reboot_count == test.iterations:
397 # Good!
398 test.update_state(status=TestState.PASSED, error_msg='')
399 elif state.reboot_count > test.iterations:
400 # Rebooted too many times
401 test.update_state(status=TestState.FAILED,
402 error_msg='Too many reboots')
403 else:
404 # Need to reboot again
405 self.reboot()
406
407 def init_states(self):
408 '''
409 Initializes all states on startup.
410 '''
411 for test in self.test_list.get_all_tests():
412 # Make sure the state server knows about all the tests,
413 # defaulting to an untested state.
414 test.update_state(update_parent=False, visible=False)
415
416 # Any 'active' tests should be marked as failed now.
417 for test in self.test_list.walk():
418 state = test.get_state()
419 if isinstance(test, factory.InformationScreen):
420 test.update_state(status=TestState.UNTESTED, error_msg='')
421 elif state.status == TestState.ACTIVE:
422 if isinstance(test, factory.RebootStep):
423 # Rebooted while the test was active - that's good.
424 self.handle_reboot_complete(test, state)
425 else:
426 test.update_state(status=TestState.FAILED,
427 error_msg='Unknown (reboot?)')
428
429 def show_next_active_test(self):
430 '''
431 Rotates to the next visible active test.
432 '''
433 self.reap_completed_tests()
434 active_tests = [
435 t for t in self.test_list.walk()
436 if t.is_leaf() and t.get_state().status == TestState.ACTIVE]
437 if not active_tests:
438 return
439
440 try:
441 next_test = active_tests[
442 (active_tests.index(self.visible_test) + 1) % len(active_tests)]
443 except ValueError: # visible_test not present in active_tests
444 next_test = active_tests[0]
445
446 self.set_visible_test(next_test)
447
448 def handle_event(self, event):
449 '''
450 Handles an event from the event server.
451 '''
452 if event.type == 'kbd_shortcut':
453 if event.key == 'Tab':
454 self.show_next_active_test()
455 test = self.test_list.kbd_shortcut_map.get(event.key)
456 if test:
457 invoc = self.invocations.get(test)
458 if invoc and test.backgroundable:
459 # Already running: just bring to the front if it
460 # has a UI.
461 logging.info('Setting visible test to %s', test.path)
462 self.event_client.post_event(
463 Event(Event.Type.SET_VISIBLE_TEST,
464 path=test.path))
465
466 self.abort_active_tests()
467 for t in test.walk():
468 t.update_state(status=TestState.UNTESTED)
469 self.run_tests(test)
470
471 def run_next_test(self):
472 '''
473 Runs the next eligible test (or tests) in self.tests_to_run.
474 '''
475 self.reap_completed_tests()
476 while self.tests_to_run:
477 logging.debug('Tests to run: %s',
478 [x.path for x in self.tests_to_run])
479
480 test = self.tests_to_run[0]
481
482 if test in self.invocations:
483 logging.info('Next test %s is already running', test.path)
484 self.tests_to_run.popleft()
485 return
486
487 if self.invocations and not (test.backgroundable and all(
488 [x.backgroundable for x in self.invocations])):
489 logging.debug('Waiting for non-backgroundable tests to '
490 'complete before running %s', test.path)
491 return
492
493 self.tests_to_run.popleft()
494
495 if isinstance(test, factory.RebootStep):
496 test.update_state(status=TestState.ACTIVE, increment_count=1,
497 error_msg='', reboot_count=0)
498 # Save pending test list in the state server
499 self.state_instance.set_shared_data(
500 'tests_after_reboot',
501 [t.path for t in self.tests_to_run])
502 self.reboot()
503
504 invoc = TestInvocation(self, test, on_completion=self.run_next_test)
505 self.invocations[test] = invoc
506 if self.visible_test is None and test.has_ui:
507 self.set_visible_test(test)
508 invoc.start()
509
510 def run_tests(self, root, untested_only=False):
511 '''
512 Runs tests under root.
513
514 The tests are run in order unless one fails (then stops).
515 Backgroundable tests are run simultaneously; when a foreground test is
516 encountered, we wait for all active tests to finish before continuing.
517 '''
518 self.tests_to_run = deque()
519 for x in root.walk():
520 if not x.is_leaf():
521 continue
522 if untested_only and x.get_state().status != TestState.UNTESTED:
523 continue
524 self.tests_to_run.append(x)
525 self.run_next_test()
526
527 def reap_completed_tests(self):
528 '''
529 Removes completed tests from the set of active tests.
530
531 Also updates the visible test if it was reaped.
532 '''
533 for t, v in dict(self.invocations).iteritems():
534 if v.is_completed():
535 del self.invocations[t]
536
537 if (self.visible_test is None or
538 self.visible_test not in self.invocations):
539 self.set_visible_test(None)
540 # Make the first running test, if any, the visible test
541 for t in self.test_list.walk():
542 if t in self.invocations:
543 self.set_visible_test(t)
544 break
545
546 def abort_active_tests(self):
547 '''
548 Kills and waits for all active tests.
549 '''
550 self.reap_completed_tests()
551 for test, invoc in self.invocations.items():
552 factory.console.info('Killing active test %s...' % test.path)
553 invoc.abort_and_join()
554 factory.console.info('Killed %s' % test.path)
555 del self.invocations[test]
556 self.reap_completed_tests()
557
558 def reboot(self):
559 '''
560 Reboots the machine (from a RebootTest).
561 '''
562 logging.info('Rebooting')
563 subprocess.check_call('sync')
564 subprocess.check_call('reboot')
565 time.sleep(30)
566 assert False, 'Never reached (should reboot)'
567
568 def main(self):
569 parser = OptionParser()
570 parser.add_option('-v', '--verbose', dest='verbose',
571 action='store_true',
572 help='Enable debug logging')
573 parser.add_option('--print_test_list', dest='print_test_list',
574 metavar='FILE',
575 help='Read and print test list FILE, and exit')
576 (self.options, self.args) = parser.parse_args()
577
578 factory.init_logging('goofy', verbose=self.options.verbose)
579
580 if self.options.print_test_list:
581 print (factory.read_test_list(self.options.print_test_list).
582 __repr__(recursive=True))
583 return
584
585 logging.info('Started')
586
587 self.start_state_server()
588 # Update HWID configuration tag.
589 self.state_instance.set_shared_data('hwid_cfg', get_hwid_cfg())
590
591 self.test_list_path = find_test_list()
592 self.test_list = factory.read_test_list(self.test_list_path,
593 self.state_instance)
594 logging.info('TEST_LIST:\n%s', self.test_list.__repr__(recursive=True))
595
596 self.init_states()
597 self.start_event_server()
598 self.start_ui()
599 prespawner.start()
600
601 def state_change_callback(test, state):
602 self.event_client.post_event(
603 Event(Event.Type.STATE_CHANGE,
604 path=test.path, state=state))
605 self.test_list.state_change_callback = state_change_callback
606
607 try:
608 tests_after_reboot = self.state_instance.get_shared_data(
609 'tests_after_reboot')
610 except KeyError:
611 tests_after_reboot = None
612
613 if tests_after_reboot is not None:
614 logging.info('Resuming tests after reboot: %s', tests_after_reboot)
615 self.state_instance.set_shared_data('tests_after_reboot', None)
616 self.tests_to_run.extend(
617 self.test_list.lookup_path(t) for t in tests_after_reboot)
618 self.run_next_test()
619 else:
620 self.run_tests(self.test_list, untested_only=True)
621
622 # Process events forever.
623 while True:
624 event = self.run_queue.get()
625 if not event:
626 # Shutdown request (although this currently never happens).
627 self.run_queue.task_done()
628 break
629
630 try:
631 event()
632 except Exception as e: # pylint: disable=W0703
633 logging.error('Error in event loop: %s', e)
634 traceback.print_exc(sys.stderr)
635 # But keep going
636 finally:
637 self.run_queue.task_done()
638
639
640if __name__ == '__main__':
641 Goofy().main()