blob: 305de1e9759a48101c06dd6787b617ae748ad30d [file] [log] [blame]
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
J. Richard Barnette1d78b012012-05-15 13:56:30 -07005import logging
6import subprocess
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07007import time
J. Richard Barnette1d78b012012-05-15 13:56:30 -07008import xmlrpclib
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07009
J. Richard Barnette45e93de2012-04-11 17:24:15 -070010from autotest_lib.client.bin import utils
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070011from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette45e93de2012-04-11 17:24:15 -070012from autotest_lib.client.common_lib.cros import autoupdater
13from autotest_lib.server import autoserv_parser
14from autotest_lib.server import site_host_attributes
15from autotest_lib.server import site_remote_power
J. Richard Barnette67ccb872012-04-19 16:34:56 -070016from autotest_lib.server.cros import servo
J. Richard Barnette45e93de2012-04-11 17:24:15 -070017from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070018
19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the ssh command line used to reach Chrome OS hosts.

    This overrides the default make_ssh_command with options tuned for
    Chrome OS.  The `hosts_file`, `connect_timeout` and `alive_interval`
    arguments are accepted but are not used here; the corresponding
    settings are fixed at the tuned values listed below.

    Tuning changes:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
      connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; which causes SSH to ping connection every
      180 seconds.  In conjunction with ServerAliveCountMax ensures that if
      the connection dies, Autotest will bail out quickly.  Originally tried
      60 secs, but saw frequent job ABORTS where the test completed
      successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
      consistency with remote_access.sh.

      - UserKnownHostsFile=/dev/null; we don't care about the keys.  Host
      keys change with every new installation, don't waste memory/space
      saving them.

      - SSH protocol forced to 2; needed for ServerAliveInterval.

    @param user  Login name passed to ssh with `-l`.
    @param port  Port number passed to ssh with `-p`.
    @param opts  Extra ssh command line options, inserted verbatim
                 ahead of the tuned `-o` settings.
    @param hosts_file  Unused.
    @param connect_timeout  Unused.
    @param alive_interval  Unused.

    @return The ssh command line, as a single string, without a
            target host name.
    """
    tuned_settings = (
        'StrictHostKeyChecking=no',
        'UserKnownHostsFile=/dev/null',
        'BatchMode=yes',
        'ConnectTimeout=30',
        'ServerAliveInterval=180',
        'ServerAliveCountMax=3',
        'ConnectionAttempts=4',
        'Protocol=2',
    )
    setting_args = ' '.join('-o ' + setting for setting in tuned_settings)
    return '/usr/bin/ssh -a -x %s %s -l %s -p %d' % (
            opts, setting_args, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070049
50
class SiteHost(remote.RemoteHost):
    """Chromium OS specific subclass of Host.

    On top of the behavior inherited from `remote.RemoteHost`, this
    class provides image installation via the autoupdater, a reboot
    command tuned for Chrome OS, XMLRPC connections tunneled over ssh,
    and helpers that let tests wait for sleep, resume, shutdown and
    boot transitions.
    """

    _parser = autoserv_parser.autoserv_parser

    # Time to wait for new kernel to be marked successful.
    _KERNEL_UPDATE_TIMEOUT = 120

    # Ephemeral file to indicate that an update has just occurred.
    _JUST_UPDATED_FLAG = '/tmp/just_updated'

    # Timeout values (in seconds) associated with various Chrome OS
    # state changes.  In general, the timeouts are the maximum time to
    # allow between some event X, and the time that the unit is
    # on (or off) the network.  Note that "time to get on the
    # network" is typically longer than the time to complete the
    # operation.
    #
    # TODO(jrbarnette):  None of these times have been thoroughly
    # tested empirically; if timeouts are a problem, increasing the
    # time limit really might be the right answer.
    #
    # SLEEP_TIMEOUT:  Time to allow for suspend to memory.
    # RESUME_TIMEOUT:  Time to allow for resume after suspend.
    # BOOT_TIMEOUT:  Time to allow for boot from power off.  Among
    #   other things, this includes time for the 30 second dev-mode
    #   screen delay.
    # USB_BOOT_TIMEOUT:  Time to allow for boot from a USB device,
    #   including the 30 second dev-mode delay.
    # SHUTDOWN_TIMEOUT:  Time to allow to shut down.
    # REBOOT_TIMEOUT:  Combination of shutdown and reboot times.

    SLEEP_TIMEOUT = 2
    RESUME_TIMEOUT = 5
    BOOT_TIMEOUT = 45
    USB_BOOT_TIMEOUT = 150
    SHUTDOWN_TIMEOUT = 5
    REBOOT_TIMEOUT = SHUTDOWN_TIMEOUT + BOOT_TIMEOUT


    def _initialize(self, hostname, servo_host=None, servo_port=None,
                    *args, **dargs):
        """Initialize superclasses, and |self.servo|.

        For creating the host servo object, there are three
        possibilities:  First, if the host is a lab system known to
        have a servo board, we connect to that servo unconditionally.
        Second, if we're called from a control file that requires
        servo features for testing, it will pass settings for
        `servo_host`, `servo_port`, or both.  If neither of these
        cases apply, `self.servo` will be `None`.

        @param hostname  Name of the host; forwarded to the superclass
                         and used to look up a lab servo.
        @param servo_host  Optional host for an explicitly requested
                           servo.
        @param servo_port  Optional port for an explicitly requested
                           servo.
        """
        super(SiteHost, self)._initialize(hostname=hostname,
                                          *args, **dargs)
        # self.env is a dictionary of environment variable settings
        # to be exported for commands run on the host.
        # LIBC_FATAL_STDERR_ can be useful for diagnosing certain
        # errors that might happen.
        self.env['LIBC_FATAL_STDERR_'] = '1'
        # Maps a remote port to a (cleanup, tunnel_proc) tuple; see
        # xmlrpc_connect() and xmlrpc_disconnect().
        self._xmlrpc_proxy_map = {}
        self.servo = servo.Servo.get_lab_servo(hostname)
        if not self.servo:
            # The Servo constructor generally doesn't accept 'None'
            # for its parameters, so pass along only the settings
            # that were actually supplied.
            servo_args = {}
            if servo_host is not None:
                servo_args['servo_host'] = servo_host
            if servo_port is not None:
                servo_args['servo_port'] = servo_port
            if servo_args:
                self.servo = servo.Servo(**servo_args)


    def machine_install(self, update_url=None, force_update=False,
                        local_devserver=False):
        """Install a Chrome OS image on the host via the autoupdater.

        If the updater reports that an update was applied, the host is
        rebooted and the method verifies that the previously inactive
        kernel partition is now the active one.  Whether or not an
        update was applied, the configured client autotest directories
        are removed from the host afterwards.

        @param update_url  URL of the update to install.  If not
                           given, the `image` option from the autoserv
                           command line parser is used.
        @param force_update  Passed through to the updater's
                             `run_update()`.
        @param local_devserver  Passed through to the
                                `ChromiumOSUpdater` constructor.

        @exception ChromiumOSError  No update URL is available, or one
                                    of the post-update kernel checks
                                    failed.
        """
        if not update_url and self._parser.options.image:
            update_url = self._parser.options.image
        elif not update_url:
            raise autoupdater.ChromiumOSError(
                'Update failed. No update URL provided.')

        # Attempt to update the system.
        updater = autoupdater.ChromiumOSUpdater(
                update_url, host=self, local_devserver=local_devserver)
        if updater.run_update(force_update):
            # Figure out active and inactive kernel.
            active_kernel, inactive_kernel = updater.get_kernel_state()

            # Ensure inactive kernel has higher priority than active.
            if (updater.get_kernel_priority(inactive_kernel)
                    < updater.get_kernel_priority(active_kernel)):
                raise autoupdater.ChromiumOSError(
                    'Update failed. The priority of the inactive kernel'
                    ' partition is less than that of the active kernel'
                    ' partition.')

            # Updater has returned, successfully, reboot the host.
            self.reboot(timeout=60, wait=True)

            # Following the reboot, verify the correct version.
            updater.check_version()

            # Figure out newly active kernel.
            new_active_kernel, _ = updater.get_kernel_state()

            # Ensure that previously inactive kernel is now the active kernel.
            if new_active_kernel != inactive_kernel:
                raise autoupdater.ChromiumOSError(
                    'Update failed. New kernel partition is not active after'
                    ' boot.')

            host_attributes = site_host_attributes.HostAttributes(self.hostname)
            if host_attributes.has_chromeos_firmware:
                # Wait until tries == 0 and success, or until timeout.
                utils.poll_for_condition(
                    lambda: (updater.get_kernel_tries(new_active_kernel) == 0
                             and updater.get_kernel_success(new_active_kernel)),
                    exception=autoupdater.ChromiumOSError(
                        'Update failed. Timed out waiting for system to mark'
                        ' new kernel as successful.'),
                    timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)

            # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
            # platform_Shutdown pass correctly.
            if updater.update_version.startswith('0.12'):
                self.reboot(timeout=60, wait=True)

            # Mark host as recently updated. Hosts are rebooted at the end of
            # every test cycle which will remove the file.
            self.run('touch %s' % self._JUST_UPDATED_FLAG)

        # Clean up any old autotest directories which may be lying around.
        for path in global_config.global_config.get_config_value(
                'AUTOSERV', 'client_autodir_paths', type=list):
            self.run('rm -rf ' + path)


    def has_just_updated(self):
        """Indicates whether the host was updated within this boot."""
        # Check for the existence of the just updated flag file.
        return self.run(
            '[ -f %s ] && echo T || echo F'
            % self._JUST_UPDATED_FLAG).stdout.strip() == 'T'


    def close(self):
        """Close the host, then disconnect all XMLRPC proxies."""
        super(SiteHost, self).close()
        self.xmlrpc_disconnect_all()


    def cleanup(self):
        """Special cleanup method to make sure hosts always get power back."""
        super(SiteHost, self).cleanup()
        remote_power = site_remote_power.RemotePower(self.hostname)
        if remote_power:
            remote_power.set_power_on()


    def reboot(self, **dargs):
        """
        This function reboots the site host. The more generic
        RemoteHost.reboot() performs sync and sleeps for 5
        seconds. This is not necessary for Chrome OS devices as the
        sync should be finished in a short time during the reboot
        command.

        @param dargs  Keyword arguments forwarded to the superclass
                      `reboot()`.  `reboot_cmd` and `fastsync` are
                      given Chrome OS defaults unless the caller
                      supplies them.
        """
        dargs.setdefault('reboot_cmd',
                         '((reboot & sleep 10; reboot -f &)'
                         ' </dev/null >/dev/null 2>&1 &)')
        # Enable fastsync to avoid running extra sync commands before reboot.
        dargs.setdefault('fastsync', True)
        super(SiteHost, self).reboot(**dargs)


    def verify_software(self):
        """Ensure the stateful partition has space for Autotest and updates.

        Similar to what is done by AbstractSSH, except instead of checking the
        Autotest installation path, just check the stateful partition.

        Checking the stateful partition is preferable in case it has been wiped,
        resulting in an Autotest installation path which doesn't exist and isn't
        writable. We still want to pass verify in this state since the partition
        will be recovered with the next install.
        """
        super(SiteHost, self).verify_software()
        self.check_diskspace(
            '/mnt/stateful_partition',
            global_config.global_config.get_config_value(
                'SERVER', 'gb_diskspace_required', type=int,
                default=20))


    def xmlrpc_connect(self, command, port, cleanup=None):
        """Connect to an XMLRPC server on the host.

        The `command` argument should be a simple shell command that
        starts an XMLRPC server on the given `port`.  The command
        must not daemonize, and must terminate cleanly on SIGTERM.
        The command is started in the background on the host, and a
        local XMLRPC client for the server is created and returned
        to the caller.

        Note that the process of creating an XMLRPC client makes no
        attempt to connect to the remote server; the caller is
        responsible for determining whether the server is running
        correctly, and is ready to serve requests.

        Any connection previously made for `port` is disconnected
        first.  If starting the server on the host fails, the ssh
        tunnel created for this connection is terminated before the
        exception is propagated.

        @param command Shell command to start the server.
        @param port Port number on which the server is expected to
                    be serving.
        @param cleanup If set, a pattern handed to `pkill -f` on the
                       host by `xmlrpc_disconnect()` to stop the
                       server.

        @return An `xmlrpclib.ServerProxy` for the tunneled server.
        """
        self.xmlrpc_disconnect(port)

        # Chrome OS on the target closes down most external ports
        # for security.  We could open the port, but doing that
        # would conflict with security tests that check that only
        # expected ports are open.  So, to get to the port on the
        # target we use an ssh tunnel.
        local_port = utils.get_unused_port()
        tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
        ssh_cmd = make_ssh_command(opts=tunnel_options)
        tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
        logging.debug('Full tunnel command: %s', tunnel_cmd)
        tunnel_proc = subprocess.Popen(tunnel_cmd, shell=True, close_fds=True)
        logging.debug('Started XMLRPC tunnel, local = %d'
                      ' remote = %d, pid = %d',
                      local_port, port, tunnel_proc.pid)

        # Start the server on the host.  Redirection in the command
        # below is necessary, because 'ssh' won't terminate until
        # background child processes close stdin, stdout, and
        # stderr.
        remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
        try:
            remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
        except Exception:
            # The tunnel has not been recorded in _xmlrpc_proxy_map
            # yet, so no later disconnect would find it.  Stop it
            # here rather than leaking the ssh process.
            if tunnel_proc.poll() is None:
                tunnel_proc.terminate()
            raise
        logging.debug('Started XMLRPC server on host %s, pid = %s',
                      self.hostname, remote_pid)

        self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
        rpc_url = 'http://localhost:%d' % local_port
        return xmlrpclib.ServerProxy(rpc_url, allow_none=True)


    def xmlrpc_disconnect(self, port):
        """Disconnect from an XMLRPC server on the host.

        Terminates the remote XMLRPC server previously started for
        the given `port`.  Also closes the local ssh tunnel created
        for the connection to the host.  This function does not
        directly alter the state of a previously returned XMLRPC
        client object; however disconnection will cause all
        subsequent calls to methods on the object to fail.

        This function does nothing if requested to disconnect a port
        that was not previously connected via `self.xmlrpc_connect()`

        The local tunnel is shut down and the port forgotten even if
        the attempt to stop the remote server raises an exception.

        @param port Port number passed to a previous call to
                    `xmlrpc_connect()`
        """
        if port not in self._xmlrpc_proxy_map:
            return
        remote_name, tunnel_proc = self._xmlrpc_proxy_map[port]
        try:
            if remote_name:
                # We use 'pkill' to find our target process rather than
                # a PID, because the host may have rebooted since
                # connecting, and we don't want to kill an innocent
                # process with the same PID.
                #
                # 'pkill' helpfully exits with status 1 if no target
                # process is found, for which run() will throw an
                # exception.  We don't want that, so we ignore the
                # status.
                self.run("pkill -f '%s'" % remote_name, ignore_status=True)
        finally:
            # Always release the local resources, so that a failure
            # to reach the host doesn't leave the tunnel running.
            if tunnel_proc.poll() is None:
                tunnel_proc.terminate()
                logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
            else:
                logging.debug('Tunnel pid %d terminated early, status %d',
                              tunnel_proc.pid, tunnel_proc.returncode)
            del self._xmlrpc_proxy_map[port]


    def xmlrpc_disconnect_all(self):
        """Disconnect all known XMLRPC proxy ports."""
        # xmlrpc_disconnect() deletes entries from the map, so walk
        # over a snapshot of the ports rather than the live mapping.
        for port in list(self._xmlrpc_proxy_map):
            self.xmlrpc_disconnect(port)


    def _ping_is_up(self):
        """Ping the host once, and return whether it responded."""
        return utils.ping(self.hostname, tries=1, deadline=1) == 0


    def _ping_wait_down(self, timeout):
        """Wait until the host no longer responds to `ping`.

        @param timeout Minimum time to allow before declaring the
                       host to be non-responsive.

        @return True if the host stopped responding to `ping`, False
                if it was still responding at the final check.
        """

        # This function is a slightly faster version of wait_down().
        #
        # In AbstractSSHHost.wait_down(), `ssh` is used to determine
        # whether the host is down.  In some situations (mine, at
        # least), `ssh` can take over a minute to determine that the
        # host is down.  The `ping` command answers the question
        # faster, so we use that here instead.
        #
        # There is no equivalent for wait_up(), because a target that
        # answers to `ping` won't necessarily respond to `ssh`.
        end_time = time.time() + timeout
        while time.time() <= end_time:
            if not self._ping_is_up():
                return True

        # If the timeout is short relative to the run time of
        # _ping_is_up(), we might be prone to false failures for
        # lack of checking frequently enough.  To be safe, we make
        # one last check _after_ the deadline.
        return not self._ping_is_up()


    def test_wait_for_sleep(self):
        """Wait for the client to enter low-power sleep mode.

        The test for "is asleep" can't distinguish a system that is
        powered off; to confirm that the unit was asleep, it is
        necessary to force resume, and then call
        `test_wait_for_resume()`.

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger sleep on the host
            host.test_wait_for_sleep()
            # trigger resume on the host
            host.test_wait_for_resume(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not go to sleep within
                            the allowed time.
        """
        if not self._ping_wait_down(timeout=self.SLEEP_TIMEOUT):
            raise error.TestFail(
                'client failed to sleep after %d seconds' %
                    self.SLEEP_TIMEOUT)


    def test_wait_for_resume(self, old_boot_id):
        """Wait for the client to resume from low-power sleep mode.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to entering sleep mode.  A
        `TestFail` exception is raised if the boot id changes.

        See @ref test_wait_for_sleep for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               target host went to sleep.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated a reboot rather than a sleep
                            cycle.
        """
        if not self.wait_up(timeout=self.RESUME_TIMEOUT):
            raise error.TestFail(
                'client failed to resume from sleep after %d seconds' %
                    self.RESUME_TIMEOUT)
        else:
            new_boot_id = self.get_boot_id()
            if new_boot_id != old_boot_id:
                raise error.TestFail(
                    'client rebooted, but sleep was expected'
                    ' (old boot %s, new boot %s)'
                        % (old_boot_id, new_boot_id))


    def test_wait_for_shutdown(self):
        """Wait for the client to shut down.

        The test for "has shut down" can't distinguish a system that
        is merely asleep; to confirm that the unit was down, it is
        necessary to force boot, and then call test_wait_for_boot().

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger shutdown on the host
            host.test_wait_for_shutdown()
            # trigger boot on the host
            host.test_wait_for_boot(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not shut down within the
                            allowed time.
        """
        if not self._ping_wait_down(timeout=self.SHUTDOWN_TIMEOUT):
            raise error.TestFail(
                'client failed to shut down after %d seconds' %
                    self.SHUTDOWN_TIMEOUT)


    def test_wait_for_boot(self, old_boot_id=None):
        """Wait for the client to boot from cold power.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to shutting down.  A
        `TestFail` exception is raised if the boot id does not
        change.  The boot id test is omitted if `old_boot_id` is not
        specified.

        The host is given up to `REBOOT_TIMEOUT` seconds to respond.

        See @ref test_wait_for_shutdown for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               shut down.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated that there was no reboot.
        """
        if not self.wait_up(timeout=self.REBOOT_TIMEOUT):
            raise error.TestFail(
                'client failed to reboot after %d seconds' %
                    self.REBOOT_TIMEOUT)
        elif old_boot_id:
            if self.get_boot_id() == old_boot_id:
                raise error.TestFail(
                    'client is back up, but did not reboot'
                    ' (boot %s)' % old_boot_id)