Blame - goofy.py - chromium.googlesource.com/chromiumos/platform/factory

blob: b74e7ea7c97d31dac6367e4f45a6cd41750b395a [file] [log] [blame]

Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# -- coding: utf-8 --
				4	#
				5	# Copyright (c) 2010 The Chromium OS Authors. All rights reserved.
				6	# Use of this source code is governed by a BSD-style license that can be
				7	# found in the LICENSE file.
				8
				9	'''
				10	The main factory flow that runs the factory test and finalizes a device.
				11	'''
				12
Hung-Te Lin	0298809	2012-03-16 12:35:17 +0800	[diff] [blame^]	13	import logging, os, pickle, pipes, re, signal, subprocess, sys, tempfile
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	14	import threading, time, traceback
				15	from collections import deque
				16	from optparse import OptionParser
				17	from Queue import Queue
				18
				19	import factory_common
				20	from autotest_lib.client.bin import prespawner
				21	from autotest_lib.client.cros import factory
				22	from autotest_lib.client.cros.factory import state
				23	from autotest_lib.client.cros.factory import TestState
				24	from autotest_lib.client.cros.factory.event import Event
				25	from autotest_lib.client.cros.factory.event import EventClient
				26	from autotest_lib.client.cros.factory.event import EventServer
				27
				28
				29	SCRIPT_PATH = os.path.realpath(__file__)
				30	CROS_FACTORY_LIB_PATH = os.path.dirname(SCRIPT_PATH)
Hung-Te Lin	6bb4855	2012-02-09 14:37:43 +0800	[diff] [blame]	31	FACTORY_UI_PATH = os.path.join(CROS_FACTORY_LIB_PATH, 'ui')
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	32	CLIENT_PATH = os.path.realpath(os.path.join(CROS_FACTORY_LIB_PATH, '..', '..'))
				33	DEFAULT_TEST_LIST_PATH = os.path.join(
				34	CLIENT_PATH , 'site_tests', 'suite_Factory', 'test_list')
				35	HWID_CFG_PATH = '/usr/local/share/chromeos-hwid/cfg'
				36
				37
				38	def get_hwid_cfg():
				39	'''
				40	Returns the HWID config tag, or an empty string if none can be found.
				41	'''
				42	if 'CROS_HWID' in os.environ:
				43	return os.environ['CROS_HWID']
				44	if os.path.exists(HWID_CFG_PATH):
				45	with open(HWID_CFG_PATH, 'rt') as hwid_cfg_handle:
				46	return hwid_cfg_handle.read().strip()
				47	return ''
				48
				49
				50	def find_test_list():
				51	'''
				52	Returns the path to the active test list, based on the HWID config tag.
				53	'''
				54	hwid_cfg = get_hwid_cfg()
				55
				56	# Try in order: test_list, test_list.$hwid_cfg, test_list.all
				57	if hwid_cfg:
				58	test_list = '%s_%s' % (DEFAULT_TEST_LIST_PATH, hwid_cfg)
				59	if os.path.exists(test_list):
				60	logging.info('Using special test list: %s', test_list)
				61	return test_list
				62	logging.info('WARNING: no specific test list for config: %s', hwid_cfg)
				63
				64	test_list = DEFAULT_TEST_LIST_PATH
				65	if os.path.exists(test_list):
				66	return test_list
				67
				68	test_list = ('%s.all' % DEFAULT_TEST_LIST_PATH)
				69	if os.path.exists(test_list):
				70	logging.info('Using default test list: ' + test_list)
				71	return test_list
				72	logging.info('ERROR: Cannot find any test list.')
				73
				74
				75	def is_process_alive(pid):
				76	'''
				77	Returns true if the named process is alive and not a zombie.
				78	'''
				79	try:
				80	with open("/proc/%d/stat" % pid) as f:
				81	return f.readline().split()[2] != 'Z'
				82	except IOError:
				83	return False
				84
				85
				86	def kill_process_tree(process, caption):
				87	'''
				88	Kills a process and all its subprocesses.
				89
				90	@param process: The process to kill (opened with the subprocess module).
				91	@param caption: A caption describing the process.
				92	'''
Hung-Te Lin	1b09470	2012-03-15 18:14:15 +0800	[diff] [blame]	93	# os.kill does not kill child processes. os.killpg kills all processes
				94	# sharing same group (and is usually used for killing process tree). But in
				95	# our case, to preserve PGID for autotest and upstart service, we need to
				96	# iterate through each level until leaf of the tree.
				97
				98	def get_all_pids(root):
				99	ps_output = subprocess.Popen(['ps','--no-headers','-eo','pid,ppid'],
				100	stdout=subprocess.PIPE)
				101	children = {}
				102	for line in ps_output.stdout:
				103	match = re.findall('\d+', line)
				104	children.setdefault(int(match[1]), []).append(int(match[0]))
				105	pids = []
				106	def add_children(pid):
				107	pids.append(pid)
				108	map(add_children, children.get(pid, []))
				109	add_children(root)
Hung-Te Lin	0298809	2012-03-16 12:35:17 +0800	[diff] [blame^]	110	# Reverse the list to first kill children then parents.
				111	# Note reversed(pids) will return an iterator instead of real list, so
				112	# we must explicitly call pids.reverse() here.
				113	pids.reverse()
				114	return pids
Hung-Te Lin	1b09470	2012-03-15 18:14:15 +0800	[diff] [blame]	115
Hung-Te Lin	0298809	2012-03-16 12:35:17 +0800	[diff] [blame^]	116	pids = get_all_pids(process.pid)
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	117	for sig in [signal.SIGTERM, signal.SIGKILL]:
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	118	logging.info('Stopping %s (pid=%s)...', caption, sorted(pids))
				119
				120	for i in range(25): # Try 25 times (200 ms between tries)
				121	for pid in pids:
				122	try:
Hung-Te Lin	0298809	2012-03-16 12:35:17 +0800	[diff] [blame^]	123	logging.info("Sending signal %s to %d", sig, pid)
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	124	os.kill(pid, sig)
				125	except OSError:
				126	pass
Jon Salz	a751ce5	2012-03-15 14:59:33 +0800	[diff] [blame]	127	pids = filter(is_process_alive, pids)
				128	if not pids:
				129	return
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	130	time.sleep(0.2) # Sleep 200 ms and try again
				131
				132	logging.warn('Failed to stop %s process. Ignoring.', caption)
				133
				134
				135	class TestInvocation(object):
				136	'''
				137	State for an active test.
				138	'''
				139	def __init__(self, goofy, test, on_completion=None):
				140	'''Constructor.
				141
				142	@param goofy: The controlling Goofy object.
				143	@param test: The FactoryTest object to test.
				144	@param on_completion: Callback to invoke in the goofy event queue
				145	on completion.
				146	'''
				147	self.goofy = goofy
				148	self.test = test
				149	self.thread = threading.Thread(target=self._run)
				150	self.on_completion = on_completion
				151
				152	self._lock = threading.Lock()
				153	# The following properties are guarded by the lock.
				154	self._aborted = False
				155	self._completed = False
				156	self._process = None
				157
				158	def start(self):
				159	'''Starts the test thread.'''
				160	self.thread.start()
				161
				162	def abort_and_join(self):
				163	'''
				164	Aborts a test (must be called from the event controller thread).
				165	'''
				166	with self._lock:
				167	self._aborted = True
				168	if self._process:
				169	kill_process_tree(self._process, 'autotest')
				170	if self.thread:
				171	self.thread.join()
				172	with self._lock:
				173	# Should be set by the thread itself, but just in case...
				174	self._completed = True
				175
				176	def is_completed(self):
				177	'''
				178	Returns true if the test has finished.
				179	'''
				180	with self._lock:
				181	return self._completed
				182
				183	def _invoke_autotest(self, test, dargs):
				184	'''
				185	Invokes an autotest test.
				186
				187	This method encapsulates all the magic necessary to run a single
				188	autotest test using the 'autotest' command-line tool and get a
				189	sane pass/fail status and error message out. It may be better
				190	to just write our own command-line wrapper for job.run_test
				191	instead.
				192
				193	@param test: the autotest to run
				194	@param dargs: the argument map
				195	@return: tuple of status (TestState.PASSED or TestState.FAILED) and
				196	error message, if any
				197	'''
				198	status = TestState.FAILED
				199	error_msg = 'Unknown'
				200
				201	try:
				202	client_dir = CLIENT_PATH
				203
				204	output_dir = '%s/results/%s' % (client_dir, test.path)
				205	if not os.path.exists(output_dir):
				206	os.makedirs(output_dir)
				207	tmp_dir = tempfile.mkdtemp(prefix='tmp', dir=output_dir)
				208
				209	control_file = os.path.join(tmp_dir, 'control')
				210	result_file = os.path.join(tmp_dir, 'result')
				211	args_file = os.path.join(tmp_dir, 'args')
				212
				213	with open(args_file, 'w') as f:
				214	pickle.dump(dargs, f)
				215
				216	# Create a new control file to use to run the test
				217	with open(control_file, 'w') as f:
				218	print >> f, 'import common, traceback, utils'
				219	print >> f, 'import cPickle as pickle'
				220	print >> f, ("success = job.run_test("
				221	"'%s', **pickle.load(open('%s')))" % (
				222	test.autotest_name, args_file))
				223
				224	print >> f, (
				225	"pickle.dump((success, "
				226	"str(job.last_error) if job.last_error else None), "
				227	"open('%s', 'w'), protocol=2)"
				228	% result_file)
				229
				230	args = ['%s/bin/autotest' % client_dir,
				231	'--output_dir', output_dir,
				232	control_file]
				233
				234	factory.console.info('Running test %s' % test.path)
				235	logging.debug('Test command line: %s', ' '.join(
				236	[pipes.quote(arg) for arg in args]))
				237
				238	with self._lock:
				239	self._process = prespawner.spawn(
				240	args, {'CROS_FACTORY_TEST_PATH': test.path})
				241
				242	returncode = self._process.wait()
				243	with self._lock:
				244	if self._aborted:
				245	error_msg = 'Aborted by operator'
				246	return
				247
				248	if returncode:
				249	# Only happens when there is an autotest-level problem (not when
				250	# the test actually failed).
				251	error_msg = 'autotest returned with code %d' % returncode
				252	return
				253
				254	with open(result_file) as f:
				255	try:
				256	success, error_msg = pickle.load(f)
				257	except: # pylint: disable=W0702
				258	logging.exception('Unable to retrieve autotest results')
				259	error_msg = 'Unable to retrieve autotest results'
				260	return
				261
				262	if success:
				263	status = TestState.PASSED
				264	error_msg = ''
				265	except Exception: # pylint: disable=W0703
				266	traceback.print_exc(sys.stderr)
				267	finally:
				268	factory.console.info('Test %s: %s' % (test.path, status))
				269	return status, error_msg # pylint: disable=W0150
				270
				271	def _run(self):
				272	with self._lock:
				273	if self._aborted:
				274	return
				275
				276	count = self.test.update_state(
				277	status=TestState.ACTIVE, increment_count=1, error_msg='').count
				278
				279	test_tag = '%s_%s' % (self.test.path, count)
				280	dargs = dict(self.test.dargs)
				281	dargs.update({'tag': test_tag,
				282	'test_list_path': self.goofy.test_list_path})
				283
				284	status, error_msg = self._invoke_autotest(self.test, dargs)
				285	self.test.update_state(status=status, error_msg=error_msg,
				286	visible=False)
				287	with self._lock:
				288	self._completed = True
				289	self.goofy.run_queue.put(self.goofy.reap_completed_tests)
				290	self.goofy.run_queue.put(self.on_completion)
				291
				292
				293	class Goofy(object):
				294	'''
				295	The main factory flow.
				296
				297	Note that all methods in this class must be invoked from the main
				298	(event) thread. Other threads, such as callbacks and TestInvocation
				299	methods, should instead post events on the run queue.
				300
				301	TODO: Unit tests. (chrome-os-partner:7409)
				302
				303	Properties:
				304	state_instance: An instance of FactoryState.
				305	state_server: The FactoryState XML/RPC server.
				306	state_server_thread: A thread running state_server.
				307	event_server: The EventServer socket server.
				308	event_server_thread: A thread running event_server.
				309	event_client: A client to the event server.
Hung-Te Lin	6bb4855	2012-02-09 14:37:43 +0800	[diff] [blame]	310	ui_process: The factory ui process object.
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	311	run_queue: A queue of callbacks to invoke from the main thread.
				312	invocations: A map from FactoryTest objects to the corresponding
				313	TestInvocations objects representing active tests.
				314	tests_to_run: A deque of tests that should be run when the current
				315	test(s) complete.
				316	options: Command-line options.
				317	args: Command-line args.
				318	test_list_path: The path to the test list.
				319	test_list: The test list.
				320	'''
				321	def __init__(self):
				322	self.state_instance = None
				323	self.state_server = None
				324	self.state_server_thread = None
				325	self.event_server = None
				326	self.event_server_thread = None
				327	self.event_client = None
				328	self.ui_process = None
				329	self.run_queue = Queue()
				330	self.invocations = {}
				331	self.tests_to_run = deque()
				332	self.visible_test = None
				333
				334	self.options = None
				335	self.args = None
				336	self.test_list_path = None
				337	self.test_list = None
				338
				339	def __del__(self):
				340	if self.ui_process:
				341	kill_process_tree(self.ui_process, 'ui')
				342	if self.state_server_thread:
				343	logging.info('Stopping state server')
				344	self.state_server.shutdown()
				345	self.state_server_thread.join()
				346	if self.event_server_thread:
				347	logging.info('Stopping event server')
				348	self.event_server.shutdown() # pylint: disable=E1101
				349	self.event_server_thread.join()
				350	prespawner.stop()
				351
				352	def start_state_server(self):
				353	self.state_instance, self.state_server = state.create_server()
				354	logging.info('Starting state server')
				355	self.state_server_thread = threading.Thread(
				356	target=self.state_server.serve_forever)
				357	self.state_server_thread.start()
				358
				359	def start_event_server(self):
				360	self.event_server = EventServer()
				361	logging.info('Starting factory event server')
				362	self.event_server_thread = threading.Thread(
				363	target=self.event_server.serve_forever) # pylint: disable=E1101
				364	self.event_server_thread.start()
				365
				366	self.event_client = EventClient(
				367	callback=self.handle_event, event_loop=self.run_queue)
				368
				369	def start_ui(self):
				370	ui_proc_args = [FACTORY_UI_PATH, self.test_list_path]
				371	logging.info('Starting ui %s', ui_proc_args)
				372	self.ui_process = subprocess.Popen(ui_proc_args)
				373	logging.info('Waiting for UI to come up...')
				374	self.event_client.wait(
				375	lambda event: event.type == Event.Type.UI_READY)
				376	logging.info('UI has started')
				377
				378	def set_visible_test(self, test):
				379	if self.visible_test == test:
				380	return
				381
				382	if test:
				383	test.update_state(visible=True)
				384	if self.visible_test:
				385	self.visible_test.update_state(visible=False)
				386	self.visible_test = test
				387
				388	def handle_reboot_complete(self, test, state):
				389	'''
				390	Handles the case where a reboot was detected during a reboot step.
				391
				392	@param test: The RebootStep.
				393	@param state: The test state.
				394	'''
				395	state = test.update_state(increment_reboot_count=1)
				396	logging.info('Detected reboot (%d of %d)',
				397	state.reboot_count, test.iterations)
				398	if state.reboot_count == test.iterations:
				399	# Good!
				400	test.update_state(status=TestState.PASSED, error_msg='')
				401	elif state.reboot_count > test.iterations:
				402	# Rebooted too many times
				403	test.update_state(status=TestState.FAILED,
				404	error_msg='Too many reboots')
				405	else:
				406	# Need to reboot again
				407	self.reboot()
				408
				409	def init_states(self):
				410	'''
				411	Initializes all states on startup.
				412	'''
				413	for test in self.test_list.get_all_tests():
				414	# Make sure the state server knows about all the tests,
				415	# defaulting to an untested state.
				416	test.update_state(update_parent=False, visible=False)
				417
				418	# Any 'active' tests should be marked as failed now.
				419	for test in self.test_list.walk():
				420	state = test.get_state()
				421	if isinstance(test, factory.InformationScreen):
				422	test.update_state(status=TestState.UNTESTED, error_msg='')
				423	elif state.status == TestState.ACTIVE:
				424	if isinstance(test, factory.RebootStep):
				425	# Rebooted while the test was active - that's good.
				426	self.handle_reboot_complete(test, state)
				427	else:
				428	test.update_state(status=TestState.FAILED,
				429	error_msg='Unknown (reboot?)')
				430
				431	def show_next_active_test(self):
				432	'''
				433	Rotates to the next visible active test.
				434	'''
				435	self.reap_completed_tests()
				436	active_tests = [
				437	t for t in self.test_list.walk()
				438	if t.is_leaf() and t.get_state().status == TestState.ACTIVE]
				439	if not active_tests:
				440	return
				441
				442	try:
				443	next_test = active_tests[
				444	(active_tests.index(self.visible_test) + 1) % len(active_tests)]
				445	except ValueError: # visible_test not present in active_tests
				446	next_test = active_tests[0]
				447
				448	self.set_visible_test(next_test)
				449
				450	def handle_event(self, event):
				451	'''
				452	Handles an event from the event server.
				453	'''
				454	if event.type == 'kbd_shortcut':
				455	if event.key == 'Tab':
				456	self.show_next_active_test()
				457	test = self.test_list.kbd_shortcut_map.get(event.key)
				458	if test:
				459	invoc = self.invocations.get(test)
				460	if invoc and test.backgroundable:
				461	# Already running: just bring to the front if it
				462	# has a UI.
				463	logging.info('Setting visible test to %s', test.path)
				464	self.event_client.post_event(
				465	Event(Event.Type.SET_VISIBLE_TEST,
				466	path=test.path))
				467
				468	self.abort_active_tests()
				469	for t in test.walk():
				470	t.update_state(status=TestState.UNTESTED)
				471	self.run_tests(test)
				472
				473	def run_next_test(self):
				474	'''
				475	Runs the next eligible test (or tests) in self.tests_to_run.
				476	'''
				477	self.reap_completed_tests()
				478	while self.tests_to_run:
				479	logging.debug('Tests to run: %s',
				480	[x.path for x in self.tests_to_run])
				481
				482	test = self.tests_to_run[0]
				483
				484	if test in self.invocations:
				485	logging.info('Next test %s is already running', test.path)
				486	self.tests_to_run.popleft()
				487	return
				488
				489	if self.invocations and not (test.backgroundable and all(
				490	[x.backgroundable for x in self.invocations])):
				491	logging.debug('Waiting for non-backgroundable tests to '
				492	'complete before running %s', test.path)
				493	return
				494
				495	self.tests_to_run.popleft()
				496
				497	if isinstance(test, factory.RebootStep):
				498	test.update_state(status=TestState.ACTIVE, increment_count=1,
				499	error_msg='', reboot_count=0)
				500	# Save pending test list in the state server
				501	self.state_instance.set_shared_data(
				502	'tests_after_reboot',
				503	[t.path for t in self.tests_to_run])
				504	self.reboot()
				505
				506	invoc = TestInvocation(self, test, on_completion=self.run_next_test)
				507	self.invocations[test] = invoc
				508	if self.visible_test is None and test.has_ui:
				509	self.set_visible_test(test)
				510	invoc.start()
				511
				512	def run_tests(self, root, untested_only=False):
				513	'''
				514	Runs tests under root.
				515
				516	The tests are run in order unless one fails (then stops).
				517	Backgroundable tests are run simultaneously; when a foreground test is
				518	encountered, we wait for all active tests to finish before continuing.
				519	'''
				520	self.tests_to_run = deque()
				521	for x in root.walk():
				522	if not x.is_leaf():
				523	continue
				524	if untested_only and x.get_state().status != TestState.UNTESTED:
				525	continue
				526	self.tests_to_run.append(x)
				527	self.run_next_test()
				528
				529	def reap_completed_tests(self):
				530	'''
				531	Removes completed tests from the set of active tests.
				532
				533	Also updates the visible test if it was reaped.
				534	'''
				535	for t, v in dict(self.invocations).iteritems():
				536	if v.is_completed():
				537	del self.invocations[t]
				538
				539	if (self.visible_test is None or
				540	self.visible_test not in self.invocations):
				541	self.set_visible_test(None)
				542	# Make the first running test, if any, the visible test
				543	for t in self.test_list.walk():
				544	if t in self.invocations:
				545	self.set_visible_test(t)
				546	break
				547
				548	def abort_active_tests(self):
				549	'''
				550	Kills and waits for all active tests.
				551	'''
				552	self.reap_completed_tests()
				553	for test, invoc in self.invocations.items():
				554	factory.console.info('Killing active test %s...' % test.path)
				555	invoc.abort_and_join()
				556	factory.console.info('Killed %s' % test.path)
				557	del self.invocations[test]
				558	self.reap_completed_tests()
				559
				560	def reboot(self):
				561	'''
				562	Reboots the machine (from a RebootTest).
				563	'''
				564	logging.info('Rebooting')
				565	subprocess.check_call('sync')
				566	subprocess.check_call('reboot')
				567	time.sleep(30)
				568	assert False, 'Never reached (should reboot)'
				569
				570	def main(self):
				571	parser = OptionParser()
				572	parser.add_option('-v', '--verbose', dest='verbose',
				573	action='store_true',
				574	help='Enable debug logging')
				575	parser.add_option('--print_test_list', dest='print_test_list',
				576	metavar='FILE',
				577	help='Read and print test list FILE, and exit')
				578	(self.options, self.args) = parser.parse_args()
				579
				580	factory.init_logging('goofy', verbose=self.options.verbose)
				581
				582	if self.options.print_test_list:
				583	print (factory.read_test_list(self.options.print_test_list).
				584	__repr__(recursive=True))
				585	return
				586
				587	logging.info('Started')
				588
				589	self.start_state_server()
				590	# Update HWID configuration tag.
				591	self.state_instance.set_shared_data('hwid_cfg', get_hwid_cfg())
				592
				593	self.test_list_path = find_test_list()
				594	self.test_list = factory.read_test_list(self.test_list_path,
				595	self.state_instance)
				596	logging.info('TEST_LIST:\n%s', self.test_list.__repr__(recursive=True))
				597
				598	self.init_states()
				599	self.start_event_server()
				600	self.start_ui()
				601	prespawner.start()
				602
				603	def state_change_callback(test, state):
				604	self.event_client.post_event(
				605	Event(Event.Type.STATE_CHANGE,
				606	path=test.path, state=state))
				607	self.test_list.state_change_callback = state_change_callback
				608
				609	try:
				610	tests_after_reboot = self.state_instance.get_shared_data(
				611	'tests_after_reboot')
				612	except KeyError:
				613	tests_after_reboot = None
				614
				615	if tests_after_reboot is not None:
				616	logging.info('Resuming tests after reboot: %s', tests_after_reboot)
				617	self.state_instance.set_shared_data('tests_after_reboot', None)
				618	self.tests_to_run.extend(
				619	self.test_list.lookup_path(t) for t in tests_after_reboot)
				620	self.run_next_test()
				621	else:
				622	self.run_tests(self.test_list, untested_only=True)
				623
				624	# Process events forever.
				625	while True:
				626	event = self.run_queue.get()
				627	if not event:
				628	# Shutdown request (although this currently never happens).
				629	self.run_queue.task_done()
				630	break
				631
				632	try:
				633	event()
				634	except Exception as e: # pylint: disable=W0703
				635	logging.error('Error in event loop: %s', e)
				636	traceback.print_exc(sys.stderr)
				637	# But keep going
				638	finally:
				639	self.run_queue.task_done()
				640
				641
				642	if __name__ == '__main__':
				643	Goofy().main()