Blame - server/hosts/site_host.py - chromium.googlesource.com/chromiumos/platform/tauto

blob: 32587ae3efb459f4e1ab63f99872b22af6cf8984 [file] [log] [blame]

J. Richard Barnette	24adbf4	2012-04-11 15:04:53 -0700	[diff] [blame]	1	# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Dale Curtis	aa5eedb	2011-08-23 16:18:52 -0700	[diff] [blame]	2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	5	import time
				6
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	7	from autotest_lib.client.bin import utils
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	8	from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	9	from autotest_lib.client.common_lib.cros import autoupdater
				10	from autotest_lib.server import autoserv_parser
				11	from autotest_lib.server import site_host_attributes
				12	from autotest_lib.server import site_remote_power
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	13	from autotest_lib.server.cros import servo
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	14	from autotest_lib.server.hosts import remote
J. Richard Barnette	24adbf4	2012-04-11 15:04:53 -0700	[diff] [blame]	15
				16
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Override default make_ssh_command to use options tuned for Chrome OS.

    Tuning changes:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
        connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; which causes SSH to ping connection every
        180 seconds.  In conjunction with ServerAliveCountMax ensures that
        if the connection dies, Autotest will bail out quickly.  Originally
        tried 60 secs, but saw frequent job ABORTS where the test completed
        successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
        consistency with remote_access.sh.

      - UserKnownHostsFile=/dev/null; we don't care about the keys.  Host
        keys change with every new installation, don't waste memory/space
        saving them.

      - SSH protocol forced to 2; needed for ServerAliveInterval.

    @param user Login name passed to `ssh -l`.
    @param port TCP port passed to `ssh -p`; must be an integer because it
            is formatted with `%d`.
    @param opts Extra command line options, inserted verbatim right after
            `-a -x`.
    @param hosts_file Accepted but not used; the known-hosts file is
            always /dev/null.
    @param connect_timeout Accepted but not used; the timeout is fixed at
            30 seconds.
    @param alive_interval Accepted but not used; the interval is fixed at
            180 seconds.

    @return The ssh command line prefix, as a single string.
    """
    base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
                    ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
                    ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
                    ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
                    ' -o Protocol=2 -l %s -p %d')
    return base_command % (opts, user, port)
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	46
				47
				48	class SiteHost(remote.RemoteHost):
				49	"""Chromium OS specific subclass of Host."""
				50
				51	_parser = autoserv_parser.autoserv_parser
				52
				53	# Time to wait for new kernel to be marked successful.
Chris Masone	163cead	2012-05-16 11:49:48 -0700	[diff] [blame]	54	_KERNEL_UPDATE_TIMEOUT = 120
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	55
				56	# Ephemeral file to indicate that an update has just occurred.
				57	_JUST_UPDATED_FLAG = '/tmp/just_updated'
				58
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	59	# Timeout values used in test_wait_for_sleep(), et al.
				60	#
				61	# _RESUME_TIMEOUT has to be big enough to allow time for WiFi
				62	# reconnection.
				63	#
				64	# _REBOOT_TIMEOUT has to be big enough to allow time for the 30
				65	# second dev-mode screen delay _and_ time for network startup,
				66	# which takes several seconds longer than boot.
				67	#
				68	# TODO(jrbarnette): None of these times have been thoroughly
				69	# tested empirically; if timeouts are a problem, increasing the
				70	# time limit really might be the right answer.
				71	_SLEEP_TIMEOUT = 2
				72	_RESUME_TIMEOUT = 5
				73	_SHUTDOWN_TIMEOUT = 5
				74	_REBOOT_TIMEOUT = 45
				75
				76
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	77	def _initialize(self, hostname, require_servo=False, args, *dargs):
				78	"""Initialize superclasses, and \|self.servo\|.
				79
				80	For creating the host servo object, there are three
				81	possibilities: First, if the host is a lab system known to
				82	have a servo board, we connect to that servo unconditionally.
				83	Second, if we're called from a control file that requires
				84	servo features for testing, it will pass \|require_servo\| set
				85	to \|True\|, and we will start a local servod. If neither of
				86	these cases apply, \|self.servo\| will be \|None\|.
				87
				88	"""
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	89	super(SiteHost, self)._initialize(hostname=hostname,
				90	args, *dargs)
J. Richard Barnette	67ccb87	2012-04-19 16:34:56 -0700	[diff] [blame]	91	self.servo = servo.Servo.get_lab_servo(hostname)
				92	if not self.servo and require_servo:
				93	self.servo = servo.Servo()
J. Richard Barnette	45e93de	2012-04-11 17:24:15 -0700	[diff] [blame]	94
				95
				96	def machine_install(self, update_url=None, force_update=False):
				97	if not update_url and self._parser.options.image:
				98	update_url = self._parser.options.image
				99	elif not update_url:
				100	raise autoupdater.ChromiumOSError(
				101	'Update failed. No update URL provided.')
				102
				103	# Attempt to update the system.
				104	updater = autoupdater.ChromiumOSUpdater(update_url, host=self)
				105	if updater.run_update(force_update):
				106	# Figure out active and inactive kernel.
				107	active_kernel, inactive_kernel = updater.get_kernel_state()
				108
				109	# Ensure inactive kernel has higher priority than active.
				110	if (updater.get_kernel_priority(inactive_kernel)
				111	< updater.get_kernel_priority(active_kernel)):
				112	raise autoupdater.ChromiumOSError(
				113	'Update failed. The priority of the inactive kernel'
				114	' partition is less than that of the active kernel'
				115	' partition.')
				116
				117	# Updater has returned, successfully, reboot the host.
				118	self.reboot(timeout=60, wait=True)
				119
				120	# Following the reboot, verify the correct version.
				121	updater.check_version()
				122
				123	# Figure out newly active kernel.
				124	new_active_kernel, _ = updater.get_kernel_state()
				125
				126	# Ensure that previously inactive kernel is now the active kernel.
				127	if new_active_kernel != inactive_kernel:
				128	raise autoupdater.ChromiumOSError(
				129	'Update failed. New kernel partition is not active after'
				130	' boot.')
				131
				132	host_attributes = site_host_attributes.HostAttributes(self.hostname)
				133	if host_attributes.has_chromeos_firmware:
				134	# Wait until tries == 0 and success, or until timeout.
				135	utils.poll_for_condition(
				136	lambda: (updater.get_kernel_tries(new_active_kernel) == 0
				137	and updater.get_kernel_success(new_active_kernel)),
				138	exception=autoupdater.ChromiumOSError(
				139	'Update failed. Timed out waiting for system to mark'
				140	' new kernel as successful.'),
				141	timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
				142
				143	# TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
				144	# platform_Shutdown pass correctly.
				145	if updater.update_version.startswith('0.12'):
				146	self.reboot(timeout=60, wait=True)
				147
				148	# Mark host as recently updated. Hosts are rebooted at the end of
				149	# every test cycle which will remove the file.
				150	self.run('touch %s' % self._JUST_UPDATED_FLAG)
				151
				152	# Clean up any old autotest directories which may be lying around.
				153	for path in global_config.global_config.get_config_value(
				154	'AUTOSERV', 'client_autodir_paths', type=list):
				155	self.run('rm -rf ' + path)
				156
				157
				158	def has_just_updated(self):
				159	"""Indicates whether the host was updated within this boot."""
				160	# Check for the existence of the just updated flag file.
				161	return self.run(
				162	'[ -f %s ] && echo T \|\| echo F'
				163	% self._JUST_UPDATED_FLAG).stdout.strip() == 'T'
				164
				165
				166	def cleanup(self):
				167	"""Special cleanup method to make sure hosts always get power back."""
				168	super(SiteHost, self).cleanup()
				169	remote_power = site_remote_power.RemotePower(self.hostname)
				170	if remote_power:
				171	remote_power.set_power_on()
				172
				173
				174	def verify_software(self):
				175	"""Ensure the stateful partition has space for Autotest and updates.
				176
				177	Similar to what is done by AbstractSSH, except instead of checking the
				178	Autotest installation path, just check the stateful partition.
				179
				180	Checking the stateful partition is preferable in case it has been wiped,
				181	resulting in an Autotest installation path which doesn't exist and isn't
				182	writable. We still want to pass verify in this state since the partition
				183	will be recovered with the next install.
				184	"""
				185	super(SiteHost, self).verify_software()
				186	self.check_diskspace(
				187	'/mnt/stateful_partition',
				188	global_config.global_config.get_config_value(
				189	'SERVER', 'gb_diskspace_required', type=int,
				190	default=20))
J. Richard Barnette	134ec2c	2012-04-25 12:59:37 -0700	[diff] [blame]	191
				192
				193	def _ping_is_up(self):
				194	"""Ping the host once, and return whether it responded."""
				195	return utils.ping(self.hostname, tries=1, deadline=1) == 0
				196
				197
				198	def _ping_wait_down(self, timeout):
				199	"""Wait until the host no longer responds to `ping`.
				200
				201	@param timeout Minimum time to allow before declaring the
				202	host to be non-responsive.
				203	"""
				204
				205	# This function is a slightly faster version of wait_down().
				206	#
				207	# In AbstractSSHHost.wait_down(), `ssh` is used to determine
				208	# whether the host is down. In some situations (mine, at
				209	# least), `ssh` can take over a minute to determine that the
				210	# host is down. The `ping` command answers the question
				211	# faster, so we use that here instead.
				212	#
				213	# There is no equivalent for wait_up(), because a target that
				214	# answers to `ping` won't necessarily respond to `ssh`.
				215	end_time = time.time() + timeout
				216	while time.time() <= end_time:
				217	if not self._ping_is_up():
				218	return True
				219
				220	# If the timeout is short relative to the run time of
				221	# _ping_is_up(), we might be prone to false failures for
				222	# lack of checking frequently enough. To be safe, we make
				223	# one last check _after_ the deadline.
				224	return not self._ping_is_up()
				225
				226
				227	def test_wait_for_sleep(self):
				228	"""Wait for the client to enter low-power sleep mode.
				229
				230	The test for "is asleep" can't distinguish a system that is
				231	powered off; to confirm that the unit was asleep, it is
				232	necessary to force resume, and then call
				233	`test_wait_for_resume()`.
				234
				235	This function is expected to be called from a test as part
				236	of a sequence like the following:
				237
				238	~~~~~~~~
				239	boot_id = host.get_boot_id()
				240	# trigger sleep on the host
				241	host.test_wait_for_sleep()
				242	# trigger resume on the host
				243	host.test_wait_for_resume(boot_id)
				244	~~~~~~~~
				245
				246	@exception TestFail The host did not go to sleep within
				247	the allowed time.
				248	"""
				249	if not self._ping_wait_down(timeout=self._SLEEP_TIMEOUT):
				250	raise error.TestFail(
				251	'client failed to sleep after %d seconds' %
				252	self._SLEEP_TIMEOUT)
				253
				254
				255	def test_wait_for_resume(self, old_boot_id):
				256	"""Wait for the client to resume from low-power sleep mode.
				257
				258	The `old_boot_id` parameter should be the value from
				259	`get_boot_id()` obtained prior to entering sleep mode. A
				260	`TestFail` exception is raised if the boot id changes.
				261
				262	See @ref test_wait_for_sleep for more on this function's
				263	usage.
				264
				265	@param[in] old_boot_id A boot id value obtained before the
				266	target host went to sleep.
				267
				268	@exception TestFail The host did not respond within the
				269	allowed time.
				270	@exception TestFail The host responded, but the boot id test
				271	indicated a reboot rather than a sleep
				272	cycle.
				273	"""
				274	if not self.wait_up(timeout=self._RESUME_TIMEOUT):
				275	raise error.TestFail(
				276	'client failed to resume from sleep after %d seconds' %
				277	self._RESUME_TIMEOUT)
				278	else:
				279	new_boot_id = self.get_boot_id()
				280	if new_boot_id != old_boot_id:
				281	raise error.TestFail(
				282	'client rebooted, but sleep was expected'
				283	' (old boot %s, new boot %s)'
				284	% (old_boot_id, new_boot_id))
				285
				286
				287	def test_wait_for_shutdown(self):
				288	"""Wait for the client to shut down.
				289
				290	The test for "has shut down" can't distinguish a system that
				291	is merely asleep; to confirm that the unit was down, it is
				292	necessary to force boot, and then call test_wait_for_boot().
				293
				294	This function is expected to be called from a test as part
				295	of a sequence like the following:
				296
				297	~~~~~~~~
				298	boot_id = host.get_boot_id()
				299	# trigger shutdown on the host
				300	host.test_wait_for_shutdown()
				301	# trigger boot on the host
				302	host.test_wait_for_boot(boot_id)
				303	~~~~~~~~
				304
				305	@exception TestFail The host did not shut down within the
				306	allowed time.
				307	"""
				308	if not self._ping_wait_down(timeout=self._SHUTDOWN_TIMEOUT):
				309	raise error.TestFail(
				310	'client failed to shut down after %d seconds' %
				311	self._SHUTDOWN_TIMEOUT)
				312
				313
				314	def test_wait_for_boot(self, old_boot_id=None):
				315	"""Wait for the client to boot from cold power.
				316
				317	The `old_boot_id` parameter should be the value from
				318	`get_boot_id()` obtained prior to shutting down. A
				319	`TestFail` exception is raised if the boot id does not
				320	change. The boot id test is omitted if `old_boot_id` is not
				321	specified.
				322
				323	See @ref test_wait_for_shutdown for more on this function's
				324	usage.
				325
				326	@param[in] old_boot_id A boot id value obtained before the
				327	shut down.
				328
				329	@exception TestFail The host did not respond within the
				330	allowed time.
				331	@exception TestFail The host responded, but the boot id test
				332	indicated that there was no reboot.
				333	"""
				334	if not self.wait_up(timeout=self._REBOOT_TIMEOUT):
				335	raise error.TestFail(
				336	'client failed to reboot after %d seconds' %
				337	self._REBOOT_TIMEOUT)
				338	elif old_boot_id:
				339	if self.get_boot_id() == old_boot_id:
				340	raise error.TestFail(
				341	'client is back up, but did not reboot'
				342	' (boot %s)' % old_boot_id)