Blame - goofy.py - chromium.googlesource.com/chromiumos/platform/factory

blob: 3bb64f26d52a61a779d4d5873d73880db0818362 [file] [log] [blame]

Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# -- coding: utf-8 --
				4	#
				5	# Copyright (c) 2010 The Chromium OS Authors. All rights reserved.
				6	# Use of this source code is governed by a BSD-style license that can be
				7	# found in the LICENSE file.
				8
				9	'''
				10	The main factory flow that runs the factory test and finalizes a device.
				11	'''
				12
				13	import logging, os, pickle, pipes, signal, subprocess, sys, tempfile
				14	import threading, time, traceback
				15	from collections import deque
				16	from optparse import OptionParser
				17	from Queue import Queue
				18
				19	import factory_common
				20	from autotest_lib.client.bin import prespawner
				21	from autotest_lib.client.cros import factory
				22	from autotest_lib.client.cros.factory import state
				23	from autotest_lib.client.cros.factory import TestState
				24	from autotest_lib.client.cros.factory.event import Event
				25	from autotest_lib.client.cros.factory.event import EventClient
				26	from autotest_lib.client.cros.factory.event import EventServer
				27
				28
				29	SCRIPT_PATH = os.path.realpath(__file__)
				30	CROS_FACTORY_LIB_PATH = os.path.dirname(SCRIPT_PATH)
Hung-Te Lin	6bb4855	2012-02-09 14:37:43 +0800	[diff] [blame]	31	FACTORY_UI_PATH = os.path.join(CROS_FACTORY_LIB_PATH, 'ui')
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	32	CLIENT_PATH = os.path.realpath(os.path.join(CROS_FACTORY_LIB_PATH, '..', '..'))
				33	DEFAULT_TEST_LIST_PATH = os.path.join(
				34	CLIENT_PATH , 'site_tests', 'suite_Factory', 'test_list')
				35	HWID_CFG_PATH = '/usr/local/share/chromeos-hwid/cfg'
				36
				37
				38	def get_hwid_cfg():
				39	'''
				40	Returns the HWID config tag, or an empty string if none can be found.
				41	'''
				42	if 'CROS_HWID' in os.environ:
				43	return os.environ['CROS_HWID']
				44	if os.path.exists(HWID_CFG_PATH):
				45	with open(HWID_CFG_PATH, 'rt') as hwid_cfg_handle:
				46	return hwid_cfg_handle.read().strip()
				47	return ''
				48
				49
				50	def find_test_list():
				51	'''
				52	Returns the path to the active test list, based on the HWID config tag.
				53	'''
				54	hwid_cfg = get_hwid_cfg()
				55
				56	# Try in order: test_list, test_list.$hwid_cfg, test_list.all
				57	if hwid_cfg:
				58	test_list = '%s_%s' % (DEFAULT_TEST_LIST_PATH, hwid_cfg)
				59	if os.path.exists(test_list):
				60	logging.info('Using special test list: %s', test_list)
				61	return test_list
				62	logging.info('WARNING: no specific test list for config: %s', hwid_cfg)
				63
				64	test_list = DEFAULT_TEST_LIST_PATH
				65	if os.path.exists(test_list):
				66	return test_list
				67
				68	test_list = ('%s.all' % DEFAULT_TEST_LIST_PATH)
				69	if os.path.exists(test_list):
				70	logging.info('Using default test list: ' + test_list)
				71	return test_list
				72	logging.info('ERROR: Cannot find any test list.')
				73
				74
				75	def is_process_alive(pid):
				76	'''
				77	Returns true if the named process is alive and not a zombie.
				78	'''
				79	try:
				80	with open("/proc/%d/stat" % pid) as f:
				81	return f.readline().split()[2] != 'Z'
				82	except IOError:
				83	return False
				84
				85
				86	def kill_process_tree(process, caption):
				87	'''
				88	Kills a process and all its subprocesses.
				89
				90	@param process: The process to kill (opened with the subprocess module).
				91	@param caption: A caption describing the process.
				92	'''
Hung-Te Lin	1b09470	2012-03-15 18:14:15 +0800	[diff] [blame^]	93	# os.kill does not kill child processes. os.killpg kills all processes
				94	# sharing same group (and is usually used for killing process tree). But in
				95	# our case, to preserve PGID for autotest and upstart service, we need to
				96	# iterate through each level until leaf of the tree.
				97
				98	def get_all_pids(root):
				99	ps_output = subprocess.Popen(['ps','--no-headers','-eo','pid,ppid'],
				100	stdout=subprocess.PIPE)
				101	children = {}
				102	for line in ps_output.stdout:
				103	match = re.findall('\d+', line)
				104	children.setdefault(int(match[1]), []).append(int(match[0]))
				105	pids = []
				106	def add_children(pid):
				107	pids.append(pid)
				108	map(add_children, children.get(pid, []))
				109	add_children(root)
				110	# Kill children first then parents.
				111	return reversed(pids)
				112
				113	pids = get_all_pids([process.pid])
				114
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	115	for sig in [signal.SIGTERM, signal.SIGKILL]:
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	116	logging.info('Stopping %s (pid=%s)...', caption, sorted(pids))
				117
				118	for i in range(25): # Try 25 times (200 ms between tries)
				119	for pid in pids:
				120	try:
Jon Salz	a751ce5	2012-03-15 14:59:33 +0800	[diff] [blame]	121	logging.info("Sending signal %s to %d" % (sig, pid))
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	122	os.kill(pid, sig)
				123	except OSError:
				124	pass
Jon Salz	a751ce5	2012-03-15 14:59:33 +0800	[diff] [blame]	125	pids = filter(is_process_alive, pids)
				126	if not pids:
				127	return
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	128	time.sleep(0.2) # Sleep 200 ms and try again
				129
				130	logging.warn('Failed to stop %s process. Ignoring.', caption)
				131
				132
				133	class TestInvocation(object):
				134	'''
				135	State for an active test.
				136	'''
				137	def __init__(self, goofy, test, on_completion=None):
				138	'''Constructor.
				139
				140	@param goofy: The controlling Goofy object.
				141	@param test: The FactoryTest object to test.
				142	@param on_completion: Callback to invoke in the goofy event queue
				143	on completion.
				144	'''
				145	self.goofy = goofy
				146	self.test = test
				147	self.thread = threading.Thread(target=self._run)
				148	self.on_completion = on_completion
				149
				150	self._lock = threading.Lock()
				151	# The following properties are guarded by the lock.
				152	self._aborted = False
				153	self._completed = False
				154	self._process = None
				155
				156	def start(self):
				157	'''Starts the test thread.'''
				158	self.thread.start()
				159
				160	def abort_and_join(self):
				161	'''
				162	Aborts a test (must be called from the event controller thread).
				163	'''
				164	with self._lock:
				165	self._aborted = True
				166	if self._process:
				167	kill_process_tree(self._process, 'autotest')
				168	if self.thread:
				169	self.thread.join()
				170	with self._lock:
				171	# Should be set by the thread itself, but just in case...
				172	self._completed = True
				173
				174	def is_completed(self):
				175	'''
				176	Returns true if the test has finished.
				177	'''
				178	with self._lock:
				179	return self._completed
				180
				181	def _invoke_autotest(self, test, dargs):
				182	'''
				183	Invokes an autotest test.
				184
				185	This method encapsulates all the magic necessary to run a single
				186	autotest test using the 'autotest' command-line tool and get a
				187	sane pass/fail status and error message out. It may be better
				188	to just write our own command-line wrapper for job.run_test
				189	instead.
				190
				191	@param test: the autotest to run
				192	@param dargs: the argument map
				193	@return: tuple of status (TestState.PASSED or TestState.FAILED) and
				194	error message, if any
				195	'''
				196	status = TestState.FAILED
				197	error_msg = 'Unknown'
				198
				199	try:
				200	client_dir = CLIENT_PATH
				201
				202	output_dir = '%s/results/%s' % (client_dir, test.path)
				203	if not os.path.exists(output_dir):
				204	os.makedirs(output_dir)
				205	tmp_dir = tempfile.mkdtemp(prefix='tmp', dir=output_dir)
				206
				207	control_file = os.path.join(tmp_dir, 'control')
				208	result_file = os.path.join(tmp_dir, 'result')
				209	args_file = os.path.join(tmp_dir, 'args')
				210
				211	with open(args_file, 'w') as f:
				212	pickle.dump(dargs, f)
				213
				214	# Create a new control file to use to run the test
				215	with open(control_file, 'w') as f:
				216	print >> f, 'import common, traceback, utils'
				217	print >> f, 'import cPickle as pickle'
				218	print >> f, ("success = job.run_test("
				219	"'%s', **pickle.load(open('%s')))" % (
				220	test.autotest_name, args_file))
				221
				222	print >> f, (
				223	"pickle.dump((success, "
				224	"str(job.last_error) if job.last_error else None), "
				225	"open('%s', 'w'), protocol=2)"
				226	% result_file)
				227
				228	args = ['%s/bin/autotest' % client_dir,
				229	'--output_dir', output_dir,
				230	control_file]
				231
				232	factory.console.info('Running test %s' % test.path)
				233	logging.debug('Test command line: %s', ' '.join(
				234	[pipes.quote(arg) for arg in args]))
				235
				236	with self._lock:
				237	self._process = prespawner.spawn(
				238	args, {'CROS_FACTORY_TEST_PATH': test.path})
				239
				240	returncode = self._process.wait()
				241	with self._lock:
				242	if self._aborted:
				243	error_msg = 'Aborted by operator'
				244	return
				245
				246	if returncode:
				247	# Only happens when there is an autotest-level problem (not when
				248	# the test actually failed).
				249	error_msg = 'autotest returned with code %d' % returncode
				250	return
				251
				252	with open(result_file) as f:
				253	try:
				254	success, error_msg = pickle.load(f)
				255	except: # pylint: disable=W0702
				256	logging.exception('Unable to retrieve autotest results')
				257	error_msg = 'Unable to retrieve autotest results'
				258	return
				259
				260	if success:
				261	status = TestState.PASSED
				262	error_msg = ''
				263	except Exception: # pylint: disable=W0703
				264	traceback.print_exc(sys.stderr)
				265	finally:
				266	factory.console.info('Test %s: %s' % (test.path, status))
				267	return status, error_msg # pylint: disable=W0150
				268
				269	def _run(self):
				270	with self._lock:
				271	if self._aborted:
				272	return
				273
				274	count = self.test.update_state(
				275	status=TestState.ACTIVE, increment_count=1, error_msg='').count
				276
				277	test_tag = '%s_%s' % (self.test.path, count)
				278	dargs = dict(self.test.dargs)
				279	dargs.update({'tag': test_tag,
				280	'test_list_path': self.goofy.test_list_path})
				281
				282	status, error_msg = self._invoke_autotest(self.test, dargs)
				283	self.test.update_state(status=status, error_msg=error_msg,
				284	visible=False)
				285	with self._lock:
				286	self._completed = True
				287	self.goofy.run_queue.put(self.goofy.reap_completed_tests)
				288	self.goofy.run_queue.put(self.on_completion)
				289
				290
				291	class Goofy(object):
				292	'''
				293	The main factory flow.
				294
				295	Note that all methods in this class must be invoked from the main
				296	(event) thread. Other threads, such as callbacks and TestInvocation
				297	methods, should instead post events on the run queue.
				298
				299	TODO: Unit tests. (chrome-os-partner:7409)
				300
				301	Properties:
				302	state_instance: An instance of FactoryState.
				303	state_server: The FactoryState XML/RPC server.
				304	state_server_thread: A thread running state_server.
				305	event_server: The EventServer socket server.
				306	event_server_thread: A thread running event_server.
				307	event_client: A client to the event server.
Hung-Te Lin	6bb4855	2012-02-09 14:37:43 +0800	[diff] [blame]	308	ui_process: The factory ui process object.
Hung-Te Lin	f2f78f7	2012-02-08 19:27:11 +0800	[diff] [blame]	309	run_queue: A queue of callbacks to invoke from the main thread.
				310	invocations: A map from FactoryTest objects to the corresponding
				311	TestInvocations objects representing active tests.
				312	tests_to_run: A deque of tests that should be run when the current
				313	test(s) complete.
				314	options: Command-line options.
				315	args: Command-line args.
				316	test_list_path: The path to the test list.
				317	test_list: The test list.
				318	'''
				319	def __init__(self):
				320	self.state_instance = None
				321	self.state_server = None
				322	self.state_server_thread = None
				323	self.event_server = None
				324	self.event_server_thread = None
				325	self.event_client = None
				326	self.ui_process = None
				327	self.run_queue = Queue()
				328	self.invocations = {}
				329	self.tests_to_run = deque()
				330	self.visible_test = None
				331
				332	self.options = None
				333	self.args = None
				334	self.test_list_path = None
				335	self.test_list = None
				336
				337	def __del__(self):
				338	if self.ui_process:
				339	kill_process_tree(self.ui_process, 'ui')
				340	if self.state_server_thread:
				341	logging.info('Stopping state server')
				342	self.state_server.shutdown()
				343	self.state_server_thread.join()
				344	if self.event_server_thread:
				345	logging.info('Stopping event server')
				346	self.event_server.shutdown() # pylint: disable=E1101
				347	self.event_server_thread.join()
				348	prespawner.stop()
				349
				350	def start_state_server(self):
				351	self.state_instance, self.state_server = state.create_server()
				352	logging.info('Starting state server')
				353	self.state_server_thread = threading.Thread(
				354	target=self.state_server.serve_forever)
				355	self.state_server_thread.start()
				356
				357	def start_event_server(self):
				358	self.event_server = EventServer()
				359	logging.info('Starting factory event server')
				360	self.event_server_thread = threading.Thread(
				361	target=self.event_server.serve_forever) # pylint: disable=E1101
				362	self.event_server_thread.start()
				363
				364	self.event_client = EventClient(
				365	callback=self.handle_event, event_loop=self.run_queue)
				366
				367	def start_ui(self):
				368	ui_proc_args = [FACTORY_UI_PATH, self.test_list_path]
				369	logging.info('Starting ui %s', ui_proc_args)
				370	self.ui_process = subprocess.Popen(ui_proc_args)
				371	logging.info('Waiting for UI to come up...')
				372	self.event_client.wait(
				373	lambda event: event.type == Event.Type.UI_READY)
				374	logging.info('UI has started')
				375
				376	def set_visible_test(self, test):
				377	if self.visible_test == test:
				378	return
				379
				380	if test:
				381	test.update_state(visible=True)
				382	if self.visible_test:
				383	self.visible_test.update_state(visible=False)
				384	self.visible_test = test
				385
				386	def handle_reboot_complete(self, test, state):
				387	'''
				388	Handles the case where a reboot was detected during a reboot step.
				389
				390	@param test: The RebootStep.
				391	@param state: The test state.
				392	'''
				393	state = test.update_state(increment_reboot_count=1)
				394	logging.info('Detected reboot (%d of %d)',
				395	state.reboot_count, test.iterations)
				396	if state.reboot_count == test.iterations:
				397	# Good!
				398	test.update_state(status=TestState.PASSED, error_msg='')
				399	elif state.reboot_count > test.iterations:
				400	# Rebooted too many times
				401	test.update_state(status=TestState.FAILED,
				402	error_msg='Too many reboots')
				403	else:
				404	# Need to reboot again
				405	self.reboot()
				406
				407	def init_states(self):
				408	'''
				409	Initializes all states on startup.
				410	'''
				411	for test in self.test_list.get_all_tests():
				412	# Make sure the state server knows about all the tests,
				413	# defaulting to an untested state.
				414	test.update_state(update_parent=False, visible=False)
				415
				416	# Any 'active' tests should be marked as failed now.
				417	for test in self.test_list.walk():
				418	state = test.get_state()
				419	if isinstance(test, factory.InformationScreen):
				420	test.update_state(status=TestState.UNTESTED, error_msg='')
				421	elif state.status == TestState.ACTIVE:
				422	if isinstance(test, factory.RebootStep):
				423	# Rebooted while the test was active - that's good.
				424	self.handle_reboot_complete(test, state)
				425	else:
				426	test.update_state(status=TestState.FAILED,
				427	error_msg='Unknown (reboot?)')
				428
				429	def show_next_active_test(self):
				430	'''
				431	Rotates to the next visible active test.
				432	'''
				433	self.reap_completed_tests()
				434	active_tests = [
				435	t for t in self.test_list.walk()
				436	if t.is_leaf() and t.get_state().status == TestState.ACTIVE]
				437	if not active_tests:
				438	return
				439
				440	try:
				441	next_test = active_tests[
				442	(active_tests.index(self.visible_test) + 1) % len(active_tests)]
				443	except ValueError: # visible_test not present in active_tests
				444	next_test = active_tests[0]
				445
				446	self.set_visible_test(next_test)
				447
				448	def handle_event(self, event):
				449	'''
				450	Handles an event from the event server.
				451	'''
				452	if event.type == 'kbd_shortcut':
				453	if event.key == 'Tab':
				454	self.show_next_active_test()
				455	test = self.test_list.kbd_shortcut_map.get(event.key)
				456	if test:
				457	invoc = self.invocations.get(test)
				458	if invoc and test.backgroundable:
				459	# Already running: just bring to the front if it
				460	# has a UI.
				461	logging.info('Setting visible test to %s', test.path)
				462	self.event_client.post_event(
				463	Event(Event.Type.SET_VISIBLE_TEST,
				464	path=test.path))
				465
				466	self.abort_active_tests()
				467	for t in test.walk():
				468	t.update_state(status=TestState.UNTESTED)
				469	self.run_tests(test)
				470
				471	def run_next_test(self):
				472	'''
				473	Runs the next eligible test (or tests) in self.tests_to_run.
				474	'''
				475	self.reap_completed_tests()
				476	while self.tests_to_run:
				477	logging.debug('Tests to run: %s',
				478	[x.path for x in self.tests_to_run])
				479
				480	test = self.tests_to_run[0]
				481
				482	if test in self.invocations:
				483	logging.info('Next test %s is already running', test.path)
				484	self.tests_to_run.popleft()
				485	return
				486
				487	if self.invocations and not (test.backgroundable and all(
				488	[x.backgroundable for x in self.invocations])):
				489	logging.debug('Waiting for non-backgroundable tests to '
				490	'complete before running %s', test.path)
				491	return
				492
				493	self.tests_to_run.popleft()
				494
				495	if isinstance(test, factory.RebootStep):
				496	test.update_state(status=TestState.ACTIVE, increment_count=1,
				497	error_msg='', reboot_count=0)
				498	# Save pending test list in the state server
				499	self.state_instance.set_shared_data(
				500	'tests_after_reboot',
				501	[t.path for t in self.tests_to_run])
				502	self.reboot()
				503
				504	invoc = TestInvocation(self, test, on_completion=self.run_next_test)
				505	self.invocations[test] = invoc
				506	if self.visible_test is None and test.has_ui:
				507	self.set_visible_test(test)
				508	invoc.start()
				509
				510	def run_tests(self, root, untested_only=False):
				511	'''
				512	Runs tests under root.
				513
				514	The tests are run in order unless one fails (then stops).
				515	Backgroundable tests are run simultaneously; when a foreground test is
				516	encountered, we wait for all active tests to finish before continuing.
				517	'''
				518	self.tests_to_run = deque()
				519	for x in root.walk():
				520	if not x.is_leaf():
				521	continue
				522	if untested_only and x.get_state().status != TestState.UNTESTED:
				523	continue
				524	self.tests_to_run.append(x)
				525	self.run_next_test()
				526
				527	def reap_completed_tests(self):
				528	'''
				529	Removes completed tests from the set of active tests.
				530
				531	Also updates the visible test if it was reaped.
				532	'''
				533	for t, v in dict(self.invocations).iteritems():
				534	if v.is_completed():
				535	del self.invocations[t]
				536
				537	if (self.visible_test is None or
				538	self.visible_test not in self.invocations):
				539	self.set_visible_test(None)
				540	# Make the first running test, if any, the visible test
				541	for t in self.test_list.walk():
				542	if t in self.invocations:
				543	self.set_visible_test(t)
				544	break
				545
				546	def abort_active_tests(self):
				547	'''
				548	Kills and waits for all active tests.
				549	'''
				550	self.reap_completed_tests()
				551	for test, invoc in self.invocations.items():
				552	factory.console.info('Killing active test %s...' % test.path)
				553	invoc.abort_and_join()
				554	factory.console.info('Killed %s' % test.path)
				555	del self.invocations[test]
				556	self.reap_completed_tests()
				557
				558	def reboot(self):
				559	'''
				560	Reboots the machine (from a RebootTest).
				561	'''
				562	logging.info('Rebooting')
				563	subprocess.check_call('sync')
				564	subprocess.check_call('reboot')
				565	time.sleep(30)
				566	assert False, 'Never reached (should reboot)'
				567
				568	def main(self):
				569	parser = OptionParser()
				570	parser.add_option('-v', '--verbose', dest='verbose',
				571	action='store_true',
				572	help='Enable debug logging')
				573	parser.add_option('--print_test_list', dest='print_test_list',
				574	metavar='FILE',
				575	help='Read and print test list FILE, and exit')
				576	(self.options, self.args) = parser.parse_args()
				577
				578	factory.init_logging('goofy', verbose=self.options.verbose)
				579
				580	if self.options.print_test_list:
				581	print (factory.read_test_list(self.options.print_test_list).
				582	__repr__(recursive=True))
				583	return
				584
				585	logging.info('Started')
				586
				587	self.start_state_server()
				588	# Update HWID configuration tag.
				589	self.state_instance.set_shared_data('hwid_cfg', get_hwid_cfg())
				590
				591	self.test_list_path = find_test_list()
				592	self.test_list = factory.read_test_list(self.test_list_path,
				593	self.state_instance)
				594	logging.info('TEST_LIST:\n%s', self.test_list.__repr__(recursive=True))
				595
				596	self.init_states()
				597	self.start_event_server()
				598	self.start_ui()
				599	prespawner.start()
				600
				601	def state_change_callback(test, state):
				602	self.event_client.post_event(
				603	Event(Event.Type.STATE_CHANGE,
				604	path=test.path, state=state))
				605	self.test_list.state_change_callback = state_change_callback
				606
				607	try:
				608	tests_after_reboot = self.state_instance.get_shared_data(
				609	'tests_after_reboot')
				610	except KeyError:
				611	tests_after_reboot = None
				612
				613	if tests_after_reboot is not None:
				614	logging.info('Resuming tests after reboot: %s', tests_after_reboot)
				615	self.state_instance.set_shared_data('tests_after_reboot', None)
				616	self.tests_to_run.extend(
				617	self.test_list.lookup_path(t) for t in tests_after_reboot)
				618	self.run_next_test()
				619	else:
				620	self.run_tests(self.test_list, untested_only=True)
				621
				622	# Process events forever.
				623	while True:
				624	event = self.run_queue.get()
				625	if not event:
				626	# Shutdown request (although this currently never happens).
				627	self.run_queue.task_done()
				628	break
				629
				630	try:
				631	event()
				632	except Exception as e: # pylint: disable=W0703
				633	logging.error('Error in event loop: %s', e)
				634	traceback.print_exc(sys.stderr)
				635	# But keep going
				636	finally:
				637	self.run_queue.task_done()
				638
				639
				640	if __name__ == '__main__':
				641	Goofy().main()