Blame - server/hosts/site_host.py - chromium.googlesource.com/chromiumos/platform/tauto

blob: 0f75d6ba8a3d37a8ec5b6dc211519f208580599a [file] [log] [blame]

J. Richard Barnette	24adbf4	2012-04-11 15:04:53 -0700	[diff] [blame]	1	# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Dale Curtis	aa5eedb	2011-08-23 16:18:52 -0700	[diff] [blame]	2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
J. Richard Barnette	1d78b01	2012-05-15 13:56:30 -0700	[diff] [blame^]	5	import logging
				6	import subprocess
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	7	import time
J. Richard Barnette	1d78b01	2012-05-15 13:56:30 -0700	[diff] [blame^]	8	import xmlrpclib
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	9
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	10	from autotest_lib.client.bin import utils
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	11	from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	12	from autotest_lib.client.common_lib.cros import autoupdater
				13	from autotest_lib.server import autoserv_parser
				14	from autotest_lib.server import site_host_attributes
				15	from autotest_lib.server import site_remote_power
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	16	from autotest_lib.server.cros import servo
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	17	from autotest_lib.server.hosts import remote
J. Richard Barnette	24adbf4	2012-04-11 15:04:53 -0700	[diff] [blame]	18
				19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Override default make_ssh_command to use options tuned for Chrome OS.

    Tuning changes:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
        connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; which causes SSH to ping connection every
        180 seconds.  In conjunction with ServerAliveCountMax ensures that
        if the connection dies, Autotest will bail out quickly.  Originally
        tried 60 secs, but saw frequent job ABORTS where the test completed
        successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
        consistency with remote_access.sh.

      - UserKnownHostsFile=/dev/null; we don't care about the keys.  Host
        keys change with every new installation, don't waste memory/space
        saving them.

      - SSH protocol forced to 2; needed for ServerAliveInterval.

    @param user Login name handed to ssh with `-l`.
    @param port TCP port handed to ssh with `-p`; must be an integer.
    @param opts Extra ssh options, inserted verbatim right after `-a -x`.
    @param hosts_file Accepted but ignored; UserKnownHostsFile is always
            /dev/null.
    @param connect_timeout Accepted but ignored; ConnectTimeout is always
            30 seconds.
    @param alive_interval Accepted but ignored; ServerAliveInterval is
            always 180 seconds.

    @return The ssh command line as a single string.  The target host is
            not included; the caller appends it.
    """
    # The three ignored parameters exist only so that this function can be
    # called with the same keyword arguments as the function it overrides.
    base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
                    ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
                    ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
                    ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
                    ' -o Protocol=2 -l %s -p %d')
    return base_command % (opts, user, port)
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	49
				50
				51	class SiteHost(remote.RemoteHost):
				52	"""Chromium OS specific subclass of Host."""
				53
				54	_parser = autoserv_parser.autoserv_parser
				55
				56	# Time to wait for new kernel to be marked successful.
Chris Masone	163cead	2012-05-16 11:49:48 -0700	[diff] [blame]	57	_KERNEL_UPDATE_TIMEOUT = 120
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	58
				59	# Ephemeral file to indicate that an update has just occurred.
				60	_JUST_UPDATED_FLAG = '/tmp/just_updated'
				61
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	62	# Timeout values used in test_wait_for_sleep(), et al.
				63	#
				64	# _RESUME_TIMEOUT has to be big enough to allow time for WiFi
				65	# reconnection.
				66	#
				67	# _REBOOT_TIMEOUT has to be big enough to allow time for the 30
				68	# second dev-mode screen delay _and_ time for network startup,
				69	# which takes several seconds longer than boot.
				70	#
				71	# TODO(jrbarnette): None of these times have been thoroughly
				72	# tested empirically; if timeouts are a problem, increasing the
				73	# time limit really might be the right answer.
				74	_SLEEP_TIMEOUT = 2
				75	_RESUME_TIMEOUT = 5
				76	_SHUTDOWN_TIMEOUT = 5
				77	_REBOOT_TIMEOUT = 45
				78
				79
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	80	def _initialize(self, hostname, require_servo=False, args, *dargs):
				81	"""Initialize superclasses, and \|self.servo\|.
				82
				83	For creating the host servo object, there are three
				84	possibilities: First, if the host is a lab system known to
				85	have a servo board, we connect to that servo unconditionally.
				86	Second, if we're called from a control file that requires
				87	servo features for testing, it will pass \|require_servo\| set
				88	to \|True\|, and we will start a local servod. If neither of
				89	these cases apply, \|self.servo\| will be \|None\|.
				90
				91	"""
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	92	super(SiteHost, self)._initialize(hostname=hostname,
				93	args, *dargs)
J. Richard Barnette	1d78b01	2012-05-15 13:56:30 -0700	[diff] [blame^]	94	self._xmlrpc_proxy_map = {}
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	95	self.servo = servo.Servo.get_lab_servo(hostname)
				96	if not self.servo and require_servo:
				97	self.servo = servo.Servo()
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	98
				99
				100	def machine_install(self, update_url=None, force_update=False):
				101	if not update_url and self._parser.options.image:
				102	update_url = self._parser.options.image
				103	elif not update_url:
				104	raise autoupdater.ChromiumOSError(
				105	'Update failed. No update URL provided.')
				106
				107	# Attempt to update the system.
				108	updater = autoupdater.ChromiumOSUpdater(update_url, host=self)
				109	if updater.run_update(force_update):
				110	# Figure out active and inactive kernel.
				111	active_kernel, inactive_kernel = updater.get_kernel_state()
				112
				113	# Ensure inactive kernel has higher priority than active.
				114	if (updater.get_kernel_priority(inactive_kernel)
				115	< updater.get_kernel_priority(active_kernel)):
				116	raise autoupdater.ChromiumOSError(
				117	'Update failed. The priority of the inactive kernel'
				118	' partition is less than that of the active kernel'
				119	' partition.')
				120
				121	# Updater has returned, successfully, reboot the host.
				122	self.reboot(timeout=60, wait=True)
				123
				124	# Following the reboot, verify the correct version.
				125	updater.check_version()
				126
				127	# Figure out newly active kernel.
				128	new_active_kernel, _ = updater.get_kernel_state()
				129
				130	# Ensure that previously inactive kernel is now the active kernel.
				131	if new_active_kernel != inactive_kernel:
				132	raise autoupdater.ChromiumOSError(
				133	'Update failed. New kernel partition is not active after'
				134	' boot.')
				135
				136	host_attributes = site_host_attributes.HostAttributes(self.hostname)
				137	if host_attributes.has_chromeos_firmware:
				138	# Wait until tries == 0 and success, or until timeout.
				139	utils.poll_for_condition(
				140	lambda: (updater.get_kernel_tries(new_active_kernel) == 0
				141	and updater.get_kernel_success(new_active_kernel)),
				142	exception=autoupdater.ChromiumOSError(
				143	'Update failed. Timed out waiting for system to mark'
				144	' new kernel as successful.'),
				145	timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
				146
				147	# TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
				148	# platform_Shutdown pass correctly.
				149	if updater.update_version.startswith('0.12'):
				150	self.reboot(timeout=60, wait=True)
				151
				152	# Mark host as recently updated. Hosts are rebooted at the end of
				153	# every test cycle which will remove the file.
				154	self.run('touch %s' % self._JUST_UPDATED_FLAG)
				155
				156	# Clean up any old autotest directories which may be lying around.
				157	for path in global_config.global_config.get_config_value(
				158	'AUTOSERV', 'client_autodir_paths', type=list):
				159	self.run('rm -rf ' + path)
				160
				161
				162	def has_just_updated(self):
				163	"""Indicates whether the host was updated within this boot."""
				164	# Check for the existence of the just updated flag file.
				165	return self.run(
				166	'[ -f %s ] && echo T \|\| echo F'
				167	% self._JUST_UPDATED_FLAG).stdout.strip() == 'T'
				168
				169
J. Richard Barnette	1d78b01	2012-05-15 13:56:30 -0700	[diff] [blame^]	170	def close(self):
				171	super(SiteHost, self).close()
				172	self.xmlrpc_disconnect_all()
				173
				174
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	175	def cleanup(self):
				176	"""Special cleanup method to make sure hosts always get power back."""
				177	super(SiteHost, self).cleanup()
				178	remote_power = site_remote_power.RemotePower(self.hostname)
				179	if remote_power:
				180	remote_power.set_power_on()
				181
				182
				183	def verify_software(self):
				184	"""Ensure the stateful partition has space for Autotest and updates.
				185
				186	Similar to what is done by AbstractSSH, except instead of checking the
				187	Autotest installation path, just check the stateful partition.
				188
				189	Checking the stateful partition is preferable in case it has been wiped,
				190	resulting in an Autotest installation path which doesn't exist and isn't
				191	writable. We still want to pass verify in this state since the partition
				192	will be recovered with the next install.
				193	"""
				194	super(SiteHost, self).verify_software()
				195	self.check_diskspace(
				196	'/mnt/stateful_partition',
				197	global_config.global_config.get_config_value(
				198	'SERVER', 'gb_diskspace_required', type=int,
				199	default=20))
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	200
				201
J. Richard Barnette	1d78b01	2012-05-15 13:56:30 -0700	[diff] [blame^]	202	def xmlrpc_connect(self, command, port, cleanup=None):
				203	"""Connect to an XMLRPC server on the host.
				204
				205	The `command` argument should be a simple shell command that
				206	starts an XMLRPC server on the given `port`. The command
				207	must not daemonize, and must terminate cleanly on SIGTERM.
				208	The command is started in the background on the host, and a
				209	local XMLRPC client for the server is created and returned
				210	to the caller.
				211
				212	Note that the process of creating an XMLRPC client makes no
				213	attempt to connect to the remote server; the caller is
				214	responsible for determining whether the server is running
				215	correctly, and is ready to serve requests.
				216
				217	@param command Shell command to start the server.
				218	@param port Port number on which the server is expected to
				219	be serving.
				220	"""
				221	self.xmlrpc_disconnect(port)
				222
				223	# Chrome OS on the target closes down most external ports
				224	# for security. We could open the port, but doing that
				225	# would conflict with security tests that check that only
				226	# expected ports are open. So, to get to the port on the
				227	# target we use an ssh tunnel.
				228	local_port = utils.get_unused_port()
				229	tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
				230	ssh_cmd = make_ssh_command(opts=tunnel_options)
				231	tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
				232	logging.debug('Full tunnel command: %s', tunnel_cmd)
				233	tunnel_proc = subprocess.Popen(tunnel_cmd, shell=True, close_fds=True)
				234	logging.debug('Started XMLRPC tunnel, local = %d'
				235	' remote = %d, pid = %d',
				236	local_port, port, tunnel_proc.pid)
				237
				238	# Start the server on the host. Redirection in the command
				239	# below is necessary, because 'ssh' won't terminate until
				240	# background child processes close stdin, stdout, and
				241	# stderr.
				242	remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
				243	remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
				244	logging.debug('Started XMLRPC server on host %s, pid = %s',
				245	self.hostname, remote_pid)
				246
				247	self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
				248	rpc_url = 'http://localhost:%d' % local_port
				249	return xmlrpclib.ServerProxy(rpc_url, allow_none=True)
				250
				251
				252	def xmlrpc_disconnect(self, port):
				253	"""Disconnect from an XMLRPC server on the host.
				254
				255	Terminates the remote XMLRPC server previously started for
				256	the given `port`. Also closes the local ssh tunnel created
				257	for the connection to the host. This function does not
				258	directly alter the state of a previously returned XMLRPC
				259	client object; however disconnection will cause all
				260	subsequent calls to methods on the object to fail.
				261
				262	This function does nothing if requested to disconnect a port
				263	that was not previously connected via `self.xmlrpc_connect()`
				264
				265	@param port Port number passed to a previous call to
				266	`xmlrpc_connect()`
				267	"""
				268	if port not in self._xmlrpc_proxy_map:
				269	return
				270	entry = self._xmlrpc_proxy_map[port]
				271	remote_name = entry[0]
				272	tunnel_proc = entry[1]
				273	if remote_name:
				274	# We use 'pkill' to find our target process rather than
				275	# a PID, because the host may have rebooted since
				276	# connecting, and we don't want to kill an innocent
				277	# process with the same PID.
				278	#
				279	# 'pkill' helpfully exits with status 1 if no target
				280	# process is found, for which run() will throw an
				281	# exception. We don't want that, so we ignore the
				282	# status.
				283	self.run("pkill -f '%s'" % remote_name, ignore_status=True)
				284
				285	if tunnel_proc.poll() is None:
				286	tunnel_proc.terminate()
				287	logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
				288	else:
				289	logging.debug('Tunnel pid %d terminated early, status %d',
				290	tunnel_proc.pid, tunnel_proc.returncode)
				291	del self._xmlrpc_proxy_map[port]
				292
				293
				294	def xmlrpc_disconnect_all(self):
				295	"""Disconnect all known XMLRPC proxy ports."""
				296	for port in self._xmlrpc_proxy_map.keys():
				297	self.xmlrpc_disconnect(port)
				298
				299
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	300	def _ping_is_up(self):
				301	"""Ping the host once, and return whether it responded."""
				302	return utils.ping(self.hostname, tries=1, deadline=1) == 0
				303
				304
				305	def _ping_wait_down(self, timeout):
				306	"""Wait until the host no longer responds to `ping`.
				307
				308	@param timeout Minimum time to allow before declaring the
				309	host to be non-responsive.
				310	"""
				311
				312	# This function is a slightly faster version of wait_down().
				313	#
				314	# In AbstractSSHHost.wait_down(), `ssh` is used to determine
				315	# whether the host is down. In some situations (mine, at
				316	# least), `ssh` can take over a minute to determine that the
				317	# host is down. The `ping` command answers the question
				318	# faster, so we use that here instead.
				319	#
				320	# There is no equivalent for wait_up(), because a target that
				321	# answers to `ping` won't necessarily respond to `ssh`.
				322	end_time = time.time() + timeout
				323	while time.time() <= end_time:
				324	if not self._ping_is_up():
				325	return True
				326
				327	# If the timeout is short relative to the run time of
				328	# _ping_is_up(), we might be prone to false failures for
				329	# lack of checking frequently enough. To be safe, we make
				330	# one last check _after_ the deadline.
				331	return not self._ping_is_up()
				332
				333
				334	def test_wait_for_sleep(self):
				335	"""Wait for the client to enter low-power sleep mode.
				336
				337	The test for "is asleep" can't distinguish a system that is
				338	powered off; to confirm that the unit was asleep, it is
				339	necessary to force resume, and then call
				340	`test_wait_for_resume()`.
				341
				342	This function is expected to be called from a test as part
				343	of a sequence like the following:
				344
				345	~~~~~~~~
				346	boot_id = host.get_boot_id()
				347	# trigger sleep on the host
				348	host.test_wait_for_sleep()
				349	# trigger resume on the host
				350	host.test_wait_for_resume(boot_id)
				351	~~~~~~~~
				352
				353	@exception TestFail The host did not go to sleep within
				354	the allowed time.
				355	"""
				356	if not self._ping_wait_down(timeout=self._SLEEP_TIMEOUT):
				357	raise error.TestFail(
				358	'client failed to sleep after %d seconds' %
				359	self._SLEEP_TIMEOUT)
				360
				361
				362	def test_wait_for_resume(self, old_boot_id):
				363	"""Wait for the client to resume from low-power sleep mode.
				364
				365	The `old_boot_id` parameter should be the value from
				366	`get_boot_id()` obtained prior to entering sleep mode. A
				367	`TestFail` exception is raised if the boot id changes.
				368
				369	See @ref test_wait_for_sleep for more on this function's
				370	usage.
				371
				372	@param[in] old_boot_id A boot id value obtained before the
				373	target host went to sleep.
				374
				375	@exception TestFail The host did not respond within the
				376	allowed time.
				377	@exception TestFail The host responded, but the boot id test
				378	indicated a reboot rather than a sleep
				379	cycle.
				380	"""
				381	if not self.wait_up(timeout=self._RESUME_TIMEOUT):
				382	raise error.TestFail(
				383	'client failed to resume from sleep after %d seconds' %
				384	self._RESUME_TIMEOUT)
				385	else:
				386	new_boot_id = self.get_boot_id()
				387	if new_boot_id != old_boot_id:
				388	raise error.TestFail(
				389	'client rebooted, but sleep was expected'
				390	' (old boot %s, new boot %s)'
				391	% (old_boot_id, new_boot_id))
				392
				393
				394	def test_wait_for_shutdown(self):
				395	"""Wait for the client to shut down.
				396
				397	The test for "has shut down" can't distinguish a system that
				398	is merely asleep; to confirm that the unit was down, it is
				399	necessary to force boot, and then call test_wait_for_boot().
				400
				401	This function is expected to be called from a test as part
				402	of a sequence like the following:
				403
				404	~~~~~~~~
				405	boot_id = host.get_boot_id()
				406	# trigger shutdown on the host
				407	host.test_wait_for_shutdown()
				408	# trigger boot on the host
				409	host.test_wait_for_boot(boot_id)
				410	~~~~~~~~
				411
				412	@exception TestFail The host did not shut down within the
				413	allowed time.
				414	"""
				415	if not self._ping_wait_down(timeout=self._SHUTDOWN_TIMEOUT):
				416	raise error.TestFail(
				417	'client failed to shut down after %d seconds' %
				418	self._SHUTDOWN_TIMEOUT)
				419
				420
				421	def test_wait_for_boot(self, old_boot_id=None):
				422	"""Wait for the client to boot from cold power.
				423
				424	The `old_boot_id` parameter should be the value from
				425	`get_boot_id()` obtained prior to shutting down. A
				426	`TestFail` exception is raised if the boot id does not
				427	change. The boot id test is omitted if `old_boot_id` is not
				428	specified.
				429
				430	See @ref test_wait_for_shutdown for more on this function's
				431	usage.
				432
				433	@param[in] old_boot_id A boot id value obtained before the
				434	shut down.
				435
				436	@exception TestFail The host did not respond within the
				437	allowed time.
				438	@exception TestFail The host responded, but the boot id test
				439	indicated that there was no reboot.
				440	"""
				441	if not self.wait_up(timeout=self._REBOOT_TIMEOUT):
				442	raise error.TestFail(
				443	'client failed to reboot after %d seconds' %
				444	self._REBOOT_TIMEOUT)
				445	elif old_boot_id:
				446	if self.get_boot_id() == old_boot_id:
				447	raise error.TestFail(
				448	'client is back up, but did not reboot'
				449	' (boot %s)' % old_boot_id)