blob: b74e7ea7c97d31dac6367e4f45a6cd41750b395a [file] [log] [blame]
Hung-Te Linf2f78f72012-02-08 19:27:11 +08001#!/usr/bin/python -u
2#
3# -*- coding: utf-8 -*-
4#
5# Copyright (c) 2010 The Chromium OS Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9'''
10The main factory flow that runs the factory test and finalizes a device.
11'''
12
Hung-Te Lin02988092012-03-16 12:35:17 +080013import logging, os, pickle, pipes, re, signal, subprocess, sys, tempfile
Hung-Te Linf2f78f72012-02-08 19:27:11 +080014import threading, time, traceback
15from collections import deque
16from optparse import OptionParser
17from Queue import Queue
18
19import factory_common
20from autotest_lib.client.bin import prespawner
21from autotest_lib.client.cros import factory
22from autotest_lib.client.cros.factory import state
23from autotest_lib.client.cros.factory import TestState
24from autotest_lib.client.cros.factory.event import Event
25from autotest_lib.client.cros.factory.event import EventClient
26from autotest_lib.client.cros.factory.event import EventServer
27
28
29SCRIPT_PATH = os.path.realpath(__file__)
30CROS_FACTORY_LIB_PATH = os.path.dirname(SCRIPT_PATH)
Hung-Te Lin6bb48552012-02-09 14:37:43 +080031FACTORY_UI_PATH = os.path.join(CROS_FACTORY_LIB_PATH, 'ui')
Hung-Te Linf2f78f72012-02-08 19:27:11 +080032CLIENT_PATH = os.path.realpath(os.path.join(CROS_FACTORY_LIB_PATH, '..', '..'))
33DEFAULT_TEST_LIST_PATH = os.path.join(
34 CLIENT_PATH , 'site_tests', 'suite_Factory', 'test_list')
35HWID_CFG_PATH = '/usr/local/share/chromeos-hwid/cfg'
36
37
38def get_hwid_cfg():
39 '''
40 Returns the HWID config tag, or an empty string if none can be found.
41 '''
42 if 'CROS_HWID' in os.environ:
43 return os.environ['CROS_HWID']
44 if os.path.exists(HWID_CFG_PATH):
45 with open(HWID_CFG_PATH, 'rt') as hwid_cfg_handle:
46 return hwid_cfg_handle.read().strip()
47 return ''
48
49
50def find_test_list():
51 '''
52 Returns the path to the active test list, based on the HWID config tag.
53 '''
54 hwid_cfg = get_hwid_cfg()
55
56 # Try in order: test_list, test_list.$hwid_cfg, test_list.all
57 if hwid_cfg:
58 test_list = '%s_%s' % (DEFAULT_TEST_LIST_PATH, hwid_cfg)
59 if os.path.exists(test_list):
60 logging.info('Using special test list: %s', test_list)
61 return test_list
62 logging.info('WARNING: no specific test list for config: %s', hwid_cfg)
63
64 test_list = DEFAULT_TEST_LIST_PATH
65 if os.path.exists(test_list):
66 return test_list
67
68 test_list = ('%s.all' % DEFAULT_TEST_LIST_PATH)
69 if os.path.exists(test_list):
70 logging.info('Using default test list: ' + test_list)
71 return test_list
72 logging.info('ERROR: Cannot find any test list.')
73
74
75def is_process_alive(pid):
76 '''
77 Returns true if the named process is alive and not a zombie.
78 '''
79 try:
80 with open("/proc/%d/stat" % pid) as f:
81 return f.readline().split()[2] != 'Z'
82 except IOError:
83 return False
84
85
86def kill_process_tree(process, caption):
87 '''
88 Kills a process and all its subprocesses.
89
90 @param process: The process to kill (opened with the subprocess module).
91 @param caption: A caption describing the process.
92 '''
Hung-Te Lin1b094702012-03-15 18:14:15 +080093 # os.kill does not kill child processes. os.killpg kills all processes
94 # sharing same group (and is usually used for killing process tree). But in
95 # our case, to preserve PGID for autotest and upstart service, we need to
96 # iterate through each level until leaf of the tree.
97
98 def get_all_pids(root):
99 ps_output = subprocess.Popen(['ps','--no-headers','-eo','pid,ppid'],
100 stdout=subprocess.PIPE)
101 children = {}
102 for line in ps_output.stdout:
103 match = re.findall('\d+', line)
104 children.setdefault(int(match[1]), []).append(int(match[0]))
105 pids = []
106 def add_children(pid):
107 pids.append(pid)
108 map(add_children, children.get(pid, []))
109 add_children(root)
Hung-Te Lin02988092012-03-16 12:35:17 +0800110 # Reverse the list to first kill children then parents.
111 # Note reversed(pids) will return an iterator instead of real list, so
112 # we must explicitly call pids.reverse() here.
113 pids.reverse()
114 return pids
Hung-Te Lin1b094702012-03-15 18:14:15 +0800115
Hung-Te Lin02988092012-03-16 12:35:17 +0800116 pids = get_all_pids(process.pid)
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800117 for sig in [signal.SIGTERM, signal.SIGKILL]:
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800118 logging.info('Stopping %s (pid=%s)...', caption, sorted(pids))
119
120 for i in range(25): # Try 25 times (200 ms between tries)
121 for pid in pids:
122 try:
Hung-Te Lin02988092012-03-16 12:35:17 +0800123 logging.info("Sending signal %s to %d", sig, pid)
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800124 os.kill(pid, sig)
125 except OSError:
126 pass
Jon Salza751ce52012-03-15 14:59:33 +0800127 pids = filter(is_process_alive, pids)
128 if not pids:
129 return
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800130 time.sleep(0.2) # Sleep 200 ms and try again
131
132 logging.warn('Failed to stop %s process. Ignoring.', caption)
133
134
135class TestInvocation(object):
136 '''
137 State for an active test.
138 '''
139 def __init__(self, goofy, test, on_completion=None):
140 '''Constructor.
141
142 @param goofy: The controlling Goofy object.
143 @param test: The FactoryTest object to test.
144 @param on_completion: Callback to invoke in the goofy event queue
145 on completion.
146 '''
147 self.goofy = goofy
148 self.test = test
149 self.thread = threading.Thread(target=self._run)
150 self.on_completion = on_completion
151
152 self._lock = threading.Lock()
153 # The following properties are guarded by the lock.
154 self._aborted = False
155 self._completed = False
156 self._process = None
157
158 def start(self):
159 '''Starts the test thread.'''
160 self.thread.start()
161
162 def abort_and_join(self):
163 '''
164 Aborts a test (must be called from the event controller thread).
165 '''
166 with self._lock:
167 self._aborted = True
168 if self._process:
169 kill_process_tree(self._process, 'autotest')
170 if self.thread:
171 self.thread.join()
172 with self._lock:
173 # Should be set by the thread itself, but just in case...
174 self._completed = True
175
176 def is_completed(self):
177 '''
178 Returns true if the test has finished.
179 '''
180 with self._lock:
181 return self._completed
182
183 def _invoke_autotest(self, test, dargs):
184 '''
185 Invokes an autotest test.
186
187 This method encapsulates all the magic necessary to run a single
188 autotest test using the 'autotest' command-line tool and get a
189 sane pass/fail status and error message out. It may be better
190 to just write our own command-line wrapper for job.run_test
191 instead.
192
193 @param test: the autotest to run
194 @param dargs: the argument map
195 @return: tuple of status (TestState.PASSED or TestState.FAILED) and
196 error message, if any
197 '''
198 status = TestState.FAILED
199 error_msg = 'Unknown'
200
201 try:
202 client_dir = CLIENT_PATH
203
204 output_dir = '%s/results/%s' % (client_dir, test.path)
205 if not os.path.exists(output_dir):
206 os.makedirs(output_dir)
207 tmp_dir = tempfile.mkdtemp(prefix='tmp', dir=output_dir)
208
209 control_file = os.path.join(tmp_dir, 'control')
210 result_file = os.path.join(tmp_dir, 'result')
211 args_file = os.path.join(tmp_dir, 'args')
212
213 with open(args_file, 'w') as f:
214 pickle.dump(dargs, f)
215
216 # Create a new control file to use to run the test
217 with open(control_file, 'w') as f:
218 print >> f, 'import common, traceback, utils'
219 print >> f, 'import cPickle as pickle'
220 print >> f, ("success = job.run_test("
221 "'%s', **pickle.load(open('%s')))" % (
222 test.autotest_name, args_file))
223
224 print >> f, (
225 "pickle.dump((success, "
226 "str(job.last_error) if job.last_error else None), "
227 "open('%s', 'w'), protocol=2)"
228 % result_file)
229
230 args = ['%s/bin/autotest' % client_dir,
231 '--output_dir', output_dir,
232 control_file]
233
234 factory.console.info('Running test %s' % test.path)
235 logging.debug('Test command line: %s', ' '.join(
236 [pipes.quote(arg) for arg in args]))
237
238 with self._lock:
239 self._process = prespawner.spawn(
240 args, {'CROS_FACTORY_TEST_PATH': test.path})
241
242 returncode = self._process.wait()
243 with self._lock:
244 if self._aborted:
245 error_msg = 'Aborted by operator'
246 return
247
248 if returncode:
249 # Only happens when there is an autotest-level problem (not when
250 # the test actually failed).
251 error_msg = 'autotest returned with code %d' % returncode
252 return
253
254 with open(result_file) as f:
255 try:
256 success, error_msg = pickle.load(f)
257 except: # pylint: disable=W0702
258 logging.exception('Unable to retrieve autotest results')
259 error_msg = 'Unable to retrieve autotest results'
260 return
261
262 if success:
263 status = TestState.PASSED
264 error_msg = ''
265 except Exception: # pylint: disable=W0703
266 traceback.print_exc(sys.stderr)
267 finally:
268 factory.console.info('Test %s: %s' % (test.path, status))
269 return status, error_msg # pylint: disable=W0150
270
271 def _run(self):
272 with self._lock:
273 if self._aborted:
274 return
275
276 count = self.test.update_state(
277 status=TestState.ACTIVE, increment_count=1, error_msg='').count
278
279 test_tag = '%s_%s' % (self.test.path, count)
280 dargs = dict(self.test.dargs)
281 dargs.update({'tag': test_tag,
282 'test_list_path': self.goofy.test_list_path})
283
284 status, error_msg = self._invoke_autotest(self.test, dargs)
285 self.test.update_state(status=status, error_msg=error_msg,
286 visible=False)
287 with self._lock:
288 self._completed = True
289 self.goofy.run_queue.put(self.goofy.reap_completed_tests)
290 self.goofy.run_queue.put(self.on_completion)
291
292
293class Goofy(object):
294 '''
295 The main factory flow.
296
297 Note that all methods in this class must be invoked from the main
298 (event) thread. Other threads, such as callbacks and TestInvocation
299 methods, should instead post events on the run queue.
300
301 TODO: Unit tests. (chrome-os-partner:7409)
302
303 Properties:
304 state_instance: An instance of FactoryState.
305 state_server: The FactoryState XML/RPC server.
306 state_server_thread: A thread running state_server.
307 event_server: The EventServer socket server.
308 event_server_thread: A thread running event_server.
309 event_client: A client to the event server.
Hung-Te Lin6bb48552012-02-09 14:37:43 +0800310 ui_process: The factory ui process object.
Hung-Te Linf2f78f72012-02-08 19:27:11 +0800311 run_queue: A queue of callbacks to invoke from the main thread.
312 invocations: A map from FactoryTest objects to the corresponding
313 TestInvocations objects representing active tests.
314 tests_to_run: A deque of tests that should be run when the current
315 test(s) complete.
316 options: Command-line options.
317 args: Command-line args.
318 test_list_path: The path to the test list.
319 test_list: The test list.
320 '''
321 def __init__(self):
322 self.state_instance = None
323 self.state_server = None
324 self.state_server_thread = None
325 self.event_server = None
326 self.event_server_thread = None
327 self.event_client = None
328 self.ui_process = None
329 self.run_queue = Queue()
330 self.invocations = {}
331 self.tests_to_run = deque()
332 self.visible_test = None
333
334 self.options = None
335 self.args = None
336 self.test_list_path = None
337 self.test_list = None
338
339 def __del__(self):
340 if self.ui_process:
341 kill_process_tree(self.ui_process, 'ui')
342 if self.state_server_thread:
343 logging.info('Stopping state server')
344 self.state_server.shutdown()
345 self.state_server_thread.join()
346 if self.event_server_thread:
347 logging.info('Stopping event server')
348 self.event_server.shutdown() # pylint: disable=E1101
349 self.event_server_thread.join()
350 prespawner.stop()
351
352 def start_state_server(self):
353 self.state_instance, self.state_server = state.create_server()
354 logging.info('Starting state server')
355 self.state_server_thread = threading.Thread(
356 target=self.state_server.serve_forever)
357 self.state_server_thread.start()
358
359 def start_event_server(self):
360 self.event_server = EventServer()
361 logging.info('Starting factory event server')
362 self.event_server_thread = threading.Thread(
363 target=self.event_server.serve_forever) # pylint: disable=E1101
364 self.event_server_thread.start()
365
366 self.event_client = EventClient(
367 callback=self.handle_event, event_loop=self.run_queue)
368
369 def start_ui(self):
370 ui_proc_args = [FACTORY_UI_PATH, self.test_list_path]
371 logging.info('Starting ui %s', ui_proc_args)
372 self.ui_process = subprocess.Popen(ui_proc_args)
373 logging.info('Waiting for UI to come up...')
374 self.event_client.wait(
375 lambda event: event.type == Event.Type.UI_READY)
376 logging.info('UI has started')
377
378 def set_visible_test(self, test):
379 if self.visible_test == test:
380 return
381
382 if test:
383 test.update_state(visible=True)
384 if self.visible_test:
385 self.visible_test.update_state(visible=False)
386 self.visible_test = test
387
388 def handle_reboot_complete(self, test, state):
389 '''
390 Handles the case where a reboot was detected during a reboot step.
391
392 @param test: The RebootStep.
393 @param state: The test state.
394 '''
395 state = test.update_state(increment_reboot_count=1)
396 logging.info('Detected reboot (%d of %d)',
397 state.reboot_count, test.iterations)
398 if state.reboot_count == test.iterations:
399 # Good!
400 test.update_state(status=TestState.PASSED, error_msg='')
401 elif state.reboot_count > test.iterations:
402 # Rebooted too many times
403 test.update_state(status=TestState.FAILED,
404 error_msg='Too many reboots')
405 else:
406 # Need to reboot again
407 self.reboot()
408
409 def init_states(self):
410 '''
411 Initializes all states on startup.
412 '''
413 for test in self.test_list.get_all_tests():
414 # Make sure the state server knows about all the tests,
415 # defaulting to an untested state.
416 test.update_state(update_parent=False, visible=False)
417
418 # Any 'active' tests should be marked as failed now.
419 for test in self.test_list.walk():
420 state = test.get_state()
421 if isinstance(test, factory.InformationScreen):
422 test.update_state(status=TestState.UNTESTED, error_msg='')
423 elif state.status == TestState.ACTIVE:
424 if isinstance(test, factory.RebootStep):
425 # Rebooted while the test was active - that's good.
426 self.handle_reboot_complete(test, state)
427 else:
428 test.update_state(status=TestState.FAILED,
429 error_msg='Unknown (reboot?)')
430
431 def show_next_active_test(self):
432 '''
433 Rotates to the next visible active test.
434 '''
435 self.reap_completed_tests()
436 active_tests = [
437 t for t in self.test_list.walk()
438 if t.is_leaf() and t.get_state().status == TestState.ACTIVE]
439 if not active_tests:
440 return
441
442 try:
443 next_test = active_tests[
444 (active_tests.index(self.visible_test) + 1) % len(active_tests)]
445 except ValueError: # visible_test not present in active_tests
446 next_test = active_tests[0]
447
448 self.set_visible_test(next_test)
449
450 def handle_event(self, event):
451 '''
452 Handles an event from the event server.
453 '''
454 if event.type == 'kbd_shortcut':
455 if event.key == 'Tab':
456 self.show_next_active_test()
457 test = self.test_list.kbd_shortcut_map.get(event.key)
458 if test:
459 invoc = self.invocations.get(test)
460 if invoc and test.backgroundable:
461 # Already running: just bring to the front if it
462 # has a UI.
463 logging.info('Setting visible test to %s', test.path)
464 self.event_client.post_event(
465 Event(Event.Type.SET_VISIBLE_TEST,
466 path=test.path))
467
468 self.abort_active_tests()
469 for t in test.walk():
470 t.update_state(status=TestState.UNTESTED)
471 self.run_tests(test)
472
473 def run_next_test(self):
474 '''
475 Runs the next eligible test (or tests) in self.tests_to_run.
476 '''
477 self.reap_completed_tests()
478 while self.tests_to_run:
479 logging.debug('Tests to run: %s',
480 [x.path for x in self.tests_to_run])
481
482 test = self.tests_to_run[0]
483
484 if test in self.invocations:
485 logging.info('Next test %s is already running', test.path)
486 self.tests_to_run.popleft()
487 return
488
489 if self.invocations and not (test.backgroundable and all(
490 [x.backgroundable for x in self.invocations])):
491 logging.debug('Waiting for non-backgroundable tests to '
492 'complete before running %s', test.path)
493 return
494
495 self.tests_to_run.popleft()
496
497 if isinstance(test, factory.RebootStep):
498 test.update_state(status=TestState.ACTIVE, increment_count=1,
499 error_msg='', reboot_count=0)
500 # Save pending test list in the state server
501 self.state_instance.set_shared_data(
502 'tests_after_reboot',
503 [t.path for t in self.tests_to_run])
504 self.reboot()
505
506 invoc = TestInvocation(self, test, on_completion=self.run_next_test)
507 self.invocations[test] = invoc
508 if self.visible_test is None and test.has_ui:
509 self.set_visible_test(test)
510 invoc.start()
511
512 def run_tests(self, root, untested_only=False):
513 '''
514 Runs tests under root.
515
516 The tests are run in order unless one fails (then stops).
517 Backgroundable tests are run simultaneously; when a foreground test is
518 encountered, we wait for all active tests to finish before continuing.
519 '''
520 self.tests_to_run = deque()
521 for x in root.walk():
522 if not x.is_leaf():
523 continue
524 if untested_only and x.get_state().status != TestState.UNTESTED:
525 continue
526 self.tests_to_run.append(x)
527 self.run_next_test()
528
529 def reap_completed_tests(self):
530 '''
531 Removes completed tests from the set of active tests.
532
533 Also updates the visible test if it was reaped.
534 '''
535 for t, v in dict(self.invocations).iteritems():
536 if v.is_completed():
537 del self.invocations[t]
538
539 if (self.visible_test is None or
540 self.visible_test not in self.invocations):
541 self.set_visible_test(None)
542 # Make the first running test, if any, the visible test
543 for t in self.test_list.walk():
544 if t in self.invocations:
545 self.set_visible_test(t)
546 break
547
548 def abort_active_tests(self):
549 '''
550 Kills and waits for all active tests.
551 '''
552 self.reap_completed_tests()
553 for test, invoc in self.invocations.items():
554 factory.console.info('Killing active test %s...' % test.path)
555 invoc.abort_and_join()
556 factory.console.info('Killed %s' % test.path)
557 del self.invocations[test]
558 self.reap_completed_tests()
559
560 def reboot(self):
561 '''
562 Reboots the machine (from a RebootTest).
563 '''
564 logging.info('Rebooting')
565 subprocess.check_call('sync')
566 subprocess.check_call('reboot')
567 time.sleep(30)
568 assert False, 'Never reached (should reboot)'
569
570 def main(self):
571 parser = OptionParser()
572 parser.add_option('-v', '--verbose', dest='verbose',
573 action='store_true',
574 help='Enable debug logging')
575 parser.add_option('--print_test_list', dest='print_test_list',
576 metavar='FILE',
577 help='Read and print test list FILE, and exit')
578 (self.options, self.args) = parser.parse_args()
579
580 factory.init_logging('goofy', verbose=self.options.verbose)
581
582 if self.options.print_test_list:
583 print (factory.read_test_list(self.options.print_test_list).
584 __repr__(recursive=True))
585 return
586
587 logging.info('Started')
588
589 self.start_state_server()
590 # Update HWID configuration tag.
591 self.state_instance.set_shared_data('hwid_cfg', get_hwid_cfg())
592
593 self.test_list_path = find_test_list()
594 self.test_list = factory.read_test_list(self.test_list_path,
595 self.state_instance)
596 logging.info('TEST_LIST:\n%s', self.test_list.__repr__(recursive=True))
597
598 self.init_states()
599 self.start_event_server()
600 self.start_ui()
601 prespawner.start()
602
603 def state_change_callback(test, state):
604 self.event_client.post_event(
605 Event(Event.Type.STATE_CHANGE,
606 path=test.path, state=state))
607 self.test_list.state_change_callback = state_change_callback
608
609 try:
610 tests_after_reboot = self.state_instance.get_shared_data(
611 'tests_after_reboot')
612 except KeyError:
613 tests_after_reboot = None
614
615 if tests_after_reboot is not None:
616 logging.info('Resuming tests after reboot: %s', tests_after_reboot)
617 self.state_instance.set_shared_data('tests_after_reboot', None)
618 self.tests_to_run.extend(
619 self.test_list.lookup_path(t) for t in tests_after_reboot)
620 self.run_next_test()
621 else:
622 self.run_tests(self.test_list, untested_only=True)
623
624 # Process events forever.
625 while True:
626 event = self.run_queue.get()
627 if not event:
628 # Shutdown request (although this currently never happens).
629 self.run_queue.task_done()
630 break
631
632 try:
633 event()
634 except Exception as e: # pylint: disable=W0703
635 logging.error('Error in event loop: %s', e)
636 traceback.print_exc(sys.stderr)
637 # But keep going
638 finally:
639 self.run_queue.task_done()
640
641
642if __name__ == '__main__':
643 Goofy().main()