blob: f5b4af25d678f537374fa2d8eaa90e1ac62b727f [file] [log] [blame]
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
J. Richard Barnette1d78b012012-05-15 13:56:30 -07005import logging
6import subprocess
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07007import time
J. Richard Barnette1d78b012012-05-15 13:56:30 -07008import xmlrpclib
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07009
J. Richard Barnette45e93de2012-04-11 17:24:15 -070010from autotest_lib.client.bin import utils
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070011from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette45e93de2012-04-11 17:24:15 -070012from autotest_lib.client.common_lib.cros import autoupdater
13from autotest_lib.server import autoserv_parser
14from autotest_lib.server import site_host_attributes
15from autotest_lib.server import site_remote_power
J. Richard Barnette67ccb872012-04-19 16:34:56 -070016from autotest_lib.server.cros import servo
J. Richard Barnette45e93de2012-04-11 17:24:15 -070017from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070018
19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the `ssh` command prefix used to reach Chrome OS hosts.

    This replaces the default make_ssh_command with a fixed set of
    options tuned for Chrome OS.  The `hosts_file`, `connect_timeout`
    and `alive_interval` parameters are accepted so that the signature
    matches what callers pass, but they are ignored: the values below
    are always used.

    Options applied:
      - StrictHostKeyChecking=no, UserKnownHostsFile=/dev/null; we
        don't care about the keys.  Host keys change with every new
        installation, so there is no point in saving them.

      - BatchMode=yes.

      - ConnectTimeout=30; at most 30 seconds are allowed for an SSH
        connection attempt.  Consistent with remote_access.sh.

      - ServerAliveInterval=180; SSH pings the connection every 180
        seconds.  Together with ServerAliveCountMax this makes
        Autotest bail out when the connection dies.  60 seconds was
        tried originally, but caused frequent job ABORTS for tests
        that had actually completed successfully.

      - ServerAliveCountMax=3; consistent with remote_access.sh.

      - ConnectionAttempts=4; reduces flakiness from connection
        errors; consistent with remote_access.sh.

      - Protocol=2; needed for ServerAliveInterval.

    @param user  Login name passed to `ssh -l`.
    @param port  Integer port number passed to `ssh -p`.
    @param opts  Extra command line options, inserted verbatim
                 ahead of the `-o` settings.

    @returns The ssh command line, without the target host name.
    """
    fixed_settings = (
        'StrictHostKeyChecking=no',
        'UserKnownHostsFile=/dev/null',
        'BatchMode=yes',
        'ConnectTimeout=30',
        'ServerAliveInterval=180',
        'ServerAliveCountMax=3',
        'ConnectionAttempts=4',
        'Protocol=2',
    )
    setting_args = ' '.join(['-o %s' % setting for setting in fixed_settings])
    return '/usr/bin/ssh -a -x %s %s -l %s -p %d' % (
            opts, setting_args, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070049
50
51class SiteHost(remote.RemoteHost):
52 """Chromium OS specific subclass of Host."""
53
54 _parser = autoserv_parser.autoserv_parser
55
56 # Time to wait for new kernel to be marked successful.
Chris Masone163cead2012-05-16 11:49:48 -070057 _KERNEL_UPDATE_TIMEOUT = 120
J. Richard Barnette45e93de2012-04-11 17:24:15 -070058
59 # Ephemeral file to indicate that an update has just occurred.
60 _JUST_UPDATED_FLAG = '/tmp/just_updated'
61
J. Richard Barnetteeb69d722012-06-18 17:29:44 -070062 # Timeout values associated with various Chrome OS state
63 # changes. In general, the timeouts are the maximum time to
64 # allow between some event X, and the time that the unit is
65 # on (or off) the network. Note that "time to get on the
66 # network" is typically longer than the time to complete the
67 # operation.
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070068 #
69 # TODO(jrbarnette): None of these times have been thoroughly
70 # tested empirically; if timeouts are a problem, increasing the
71 # time limit really might be the right answer.
J. Richard Barnetteeb69d722012-06-18 17:29:44 -070072 #
73 # SLEEP_TIMEOUT: Time to allow for suspend to memory.
74 # RESUME_TIMEOUT: Time to allow for resume after suspend.
75 # BOOT_TIMEOUT: Time to allow for boot from power off. Among
76 # other things, this includes time for the 30 second dev-mode
77 # screen delay,
78 # USB_BOOT_TIMEOUT: Time to allow for boot from a USB device,
79 # including the 30 second dev-mode delay.
80 # SHUTDOWN_TIMEOUT: Time to allow to shut down.
81 # REBOOT_TIMEOUT: Combination of shutdown and reboot times.
82
83 SLEEP_TIMEOUT = 2
84 RESUME_TIMEOUT = 5
85 BOOT_TIMEOUT = 45
86 USB_BOOT_TIMEOUT = 150
87 SHUTDOWN_TIMEOUT = 5
88 REBOOT_TIMEOUT = SHUTDOWN_TIMEOUT + BOOT_TIMEOUT
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070089
90
J. Richard Barnette55fb8062012-05-23 10:29:31 -070091 def _initialize(self, hostname, servo_host=None, servo_port=None,
92 *args, **dargs):
J. Richard Barnette67ccb872012-04-19 16:34:56 -070093 """Initialize superclasses, and |self.servo|.
94
95 For creating the host servo object, there are three
96 possibilities: First, if the host is a lab system known to
97 have a servo board, we connect to that servo unconditionally.
98 Second, if we're called from a control file that requires
J. Richard Barnette55fb8062012-05-23 10:29:31 -070099 servo features for testing, it will pass settings for
100 `servo_host`, `servo_port`, or both. If neither of these
101 cases apply, `self.servo` will be `None`.
J. Richard Barnette67ccb872012-04-19 16:34:56 -0700102
103 """
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700104 super(SiteHost, self)._initialize(hostname=hostname,
105 *args, **dargs)
J. Richard Barnettef0859852012-08-20 14:55:50 -0700106 # self.env is a dictionary of environment variable settings
107 # to be exported for commands run on the host.
108 # LIBC_FATAL_STDERR_ can be useful for diagnosing certain
109 # errors that might happen.
110 self.env['LIBC_FATAL_STDERR_'] = '1'
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700111 self._xmlrpc_proxy_map = {}
J. Richard Barnette67ccb872012-04-19 16:34:56 -0700112 self.servo = servo.Servo.get_lab_servo(hostname)
J. Richard Barnette55fb8062012-05-23 10:29:31 -0700113 if not self.servo:
114 # The Servo constructor generally doesn't accept 'None'
115 # for its parameters.
116 if servo_host is not None:
117 if servo_port is not None:
118 self.servo = servo.Servo(servo_host=servo_host,
119 servo_port=servo_port)
120 else:
121 self.servo = servo.Servo(servo_host=servo_host)
122 elif servo_port is not None:
123 self.servo = servo.Servo(servo_port=servo_port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700124
125
Chris Sosaa3ac2152012-05-23 22:23:13 -0700126 def machine_install(self, update_url=None, force_update=False,
127 local_devserver=False):
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700128 if not update_url and self._parser.options.image:
129 update_url = self._parser.options.image
130 elif not update_url:
131 raise autoupdater.ChromiumOSError(
132 'Update failed. No update URL provided.')
133
134 # Attempt to update the system.
Chris Sosaa3ac2152012-05-23 22:23:13 -0700135 updater = autoupdater.ChromiumOSUpdater(update_url, host=self,
136 local_devserver=local_devserver)
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700137 if updater.run_update(force_update):
138 # Figure out active and inactive kernel.
139 active_kernel, inactive_kernel = updater.get_kernel_state()
140
141 # Ensure inactive kernel has higher priority than active.
142 if (updater.get_kernel_priority(inactive_kernel)
143 < updater.get_kernel_priority(active_kernel)):
144 raise autoupdater.ChromiumOSError(
145 'Update failed. The priority of the inactive kernel'
146 ' partition is less than that of the active kernel'
147 ' partition.')
148
Scott Zawalski21902002012-09-19 17:57:00 -0400149 update_engine_log = '/var/log/update_engine.log'
150 logging.info('Dumping %s', update_engine_log)
151 self.run('cat %s' % update_engine_log)
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700152 # Updater has returned, successfully, reboot the host.
153 self.reboot(timeout=60, wait=True)
154
155 # Following the reboot, verify the correct version.
156 updater.check_version()
157
158 # Figure out newly active kernel.
159 new_active_kernel, _ = updater.get_kernel_state()
160
161 # Ensure that previously inactive kernel is now the active kernel.
162 if new_active_kernel != inactive_kernel:
163 raise autoupdater.ChromiumOSError(
164 'Update failed. New kernel partition is not active after'
165 ' boot.')
166
167 host_attributes = site_host_attributes.HostAttributes(self.hostname)
168 if host_attributes.has_chromeos_firmware:
169 # Wait until tries == 0 and success, or until timeout.
170 utils.poll_for_condition(
171 lambda: (updater.get_kernel_tries(new_active_kernel) == 0
172 and updater.get_kernel_success(new_active_kernel)),
173 exception=autoupdater.ChromiumOSError(
174 'Update failed. Timed out waiting for system to mark'
175 ' new kernel as successful.'),
176 timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
177
178 # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
179 # platform_Shutdown pass correctly.
180 if updater.update_version.startswith('0.12'):
181 self.reboot(timeout=60, wait=True)
182
183 # Mark host as recently updated. Hosts are rebooted at the end of
184 # every test cycle which will remove the file.
185 self.run('touch %s' % self._JUST_UPDATED_FLAG)
186
187 # Clean up any old autotest directories which may be lying around.
188 for path in global_config.global_config.get_config_value(
189 'AUTOSERV', 'client_autodir_paths', type=list):
190 self.run('rm -rf ' + path)
191
192
193 def has_just_updated(self):
194 """Indicates whether the host was updated within this boot."""
195 # Check for the existence of the just updated flag file.
196 return self.run(
197 '[ -f %s ] && echo T || echo F'
198 % self._JUST_UPDATED_FLAG).stdout.strip() == 'T'
199
200
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700201 def close(self):
202 super(SiteHost, self).close()
203 self.xmlrpc_disconnect_all()
204
205
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700206 def cleanup(self):
207 """Special cleanup method to make sure hosts always get power back."""
208 super(SiteHost, self).cleanup()
209 remote_power = site_remote_power.RemotePower(self.hostname)
210 if remote_power:
211 remote_power.set_power_on()
212
213
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700214 def reboot(self, **dargs):
215 """
216 This function reboots the site host. The more generic
217 RemoteHost.reboot() performs sync and sleeps for 5
218 seconds. This is not necessary for Chrome OS devices as the
219 sync should be finished in a short time during the reboot
220 command.
221 """
Tom Wai-Hong Tamf5cd1d42012-08-13 12:04:08 +0800222 if 'reboot_cmd' not in dargs:
223 dargs['reboot_cmd'] = ('((reboot & sleep 10; reboot -f &)'
224 ' </dev/null >/dev/null 2>&1 &)')
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700225 # Enable fastsync to avoid running extra sync commands before reboot.
Tom Wai-Hong Tamf5cd1d42012-08-13 12:04:08 +0800226 if 'fastsync' not in dargs:
227 dargs['fastsync'] = True
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700228 super(SiteHost, self).reboot(**dargs)
229
230
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700231 def verify_software(self):
232 """Ensure the stateful partition has space for Autotest and updates.
233
234 Similar to what is done by AbstractSSH, except instead of checking the
235 Autotest installation path, just check the stateful partition.
236
237 Checking the stateful partition is preferable in case it has been wiped,
238 resulting in an Autotest installation path which doesn't exist and isn't
239 writable. We still want to pass verify in this state since the partition
240 will be recovered with the next install.
241 """
242 super(SiteHost, self).verify_software()
243 self.check_diskspace(
244 '/mnt/stateful_partition',
245 global_config.global_config.get_config_value(
246 'SERVER', 'gb_diskspace_required', type=int,
247 default=20))
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700248
249
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700250 def xmlrpc_connect(self, command, port, cleanup=None):
251 """Connect to an XMLRPC server on the host.
252
253 The `command` argument should be a simple shell command that
254 starts an XMLRPC server on the given `port`. The command
255 must not daemonize, and must terminate cleanly on SIGTERM.
256 The command is started in the background on the host, and a
257 local XMLRPC client for the server is created and returned
258 to the caller.
259
260 Note that the process of creating an XMLRPC client makes no
261 attempt to connect to the remote server; the caller is
262 responsible for determining whether the server is running
263 correctly, and is ready to serve requests.
264
265 @param command Shell command to start the server.
266 @param port Port number on which the server is expected to
267 be serving.
268 """
269 self.xmlrpc_disconnect(port)
270
271 # Chrome OS on the target closes down most external ports
272 # for security. We could open the port, but doing that
273 # would conflict with security tests that check that only
274 # expected ports are open. So, to get to the port on the
275 # target we use an ssh tunnel.
276 local_port = utils.get_unused_port()
277 tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
278 ssh_cmd = make_ssh_command(opts=tunnel_options)
279 tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
280 logging.debug('Full tunnel command: %s', tunnel_cmd)
281 tunnel_proc = subprocess.Popen(tunnel_cmd, shell=True, close_fds=True)
282 logging.debug('Started XMLRPC tunnel, local = %d'
283 ' remote = %d, pid = %d',
284 local_port, port, tunnel_proc.pid)
285
286 # Start the server on the host. Redirection in the command
287 # below is necessary, because 'ssh' won't terminate until
288 # background child processes close stdin, stdout, and
289 # stderr.
290 remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
291 remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
292 logging.debug('Started XMLRPC server on host %s, pid = %s',
293 self.hostname, remote_pid)
294
295 self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
296 rpc_url = 'http://localhost:%d' % local_port
297 return xmlrpclib.ServerProxy(rpc_url, allow_none=True)
298
299
300 def xmlrpc_disconnect(self, port):
301 """Disconnect from an XMLRPC server on the host.
302
303 Terminates the remote XMLRPC server previously started for
304 the given `port`. Also closes the local ssh tunnel created
305 for the connection to the host. This function does not
306 directly alter the state of a previously returned XMLRPC
307 client object; however disconnection will cause all
308 subsequent calls to methods on the object to fail.
309
310 This function does nothing if requested to disconnect a port
311 that was not previously connected via `self.xmlrpc_connect()`
312
313 @param port Port number passed to a previous call to
314 `xmlrpc_connect()`
315 """
316 if port not in self._xmlrpc_proxy_map:
317 return
318 entry = self._xmlrpc_proxy_map[port]
319 remote_name = entry[0]
320 tunnel_proc = entry[1]
321 if remote_name:
322 # We use 'pkill' to find our target process rather than
323 # a PID, because the host may have rebooted since
324 # connecting, and we don't want to kill an innocent
325 # process with the same PID.
326 #
327 # 'pkill' helpfully exits with status 1 if no target
328 # process is found, for which run() will throw an
329 # exception. We don't want that, so we ignore the
330 # status.
331 self.run("pkill -f '%s'" % remote_name, ignore_status=True)
332
333 if tunnel_proc.poll() is None:
334 tunnel_proc.terminate()
335 logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
336 else:
337 logging.debug('Tunnel pid %d terminated early, status %d',
338 tunnel_proc.pid, tunnel_proc.returncode)
339 del self._xmlrpc_proxy_map[port]
340
341
342 def xmlrpc_disconnect_all(self):
343 """Disconnect all known XMLRPC proxy ports."""
344 for port in self._xmlrpc_proxy_map.keys():
345 self.xmlrpc_disconnect(port)
346
347
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700348 def _ping_is_up(self):
349 """Ping the host once, and return whether it responded."""
350 return utils.ping(self.hostname, tries=1, deadline=1) == 0
351
352
353 def _ping_wait_down(self, timeout):
354 """Wait until the host no longer responds to `ping`.
355
356 @param timeout Minimum time to allow before declaring the
357 host to be non-responsive.
358 """
359
360 # This function is a slightly faster version of wait_down().
361 #
362 # In AbstractSSHHost.wait_down(), `ssh` is used to determine
363 # whether the host is down. In some situations (mine, at
364 # least), `ssh` can take over a minute to determine that the
365 # host is down. The `ping` command answers the question
366 # faster, so we use that here instead.
367 #
368 # There is no equivalent for wait_up(), because a target that
369 # answers to `ping` won't necessarily respond to `ssh`.
370 end_time = time.time() + timeout
371 while time.time() <= end_time:
372 if not self._ping_is_up():
373 return True
374
375 # If the timeout is short relative to the run time of
376 # _ping_is_up(), we might be prone to false failures for
377 # lack of checking frequently enough. To be safe, we make
378 # one last check _after_ the deadline.
379 return not self._ping_is_up()
380
381
382 def test_wait_for_sleep(self):
383 """Wait for the client to enter low-power sleep mode.
384
385 The test for "is asleep" can't distinguish a system that is
386 powered off; to confirm that the unit was asleep, it is
387 necessary to force resume, and then call
388 `test_wait_for_resume()`.
389
390 This function is expected to be called from a test as part
391 of a sequence like the following:
392
393 ~~~~~~~~
394 boot_id = host.get_boot_id()
395 # trigger sleep on the host
396 host.test_wait_for_sleep()
397 # trigger resume on the host
398 host.test_wait_for_resume(boot_id)
399 ~~~~~~~~
400
401 @exception TestFail The host did not go to sleep within
402 the allowed time.
403 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700404 if not self._ping_wait_down(timeout=self.SLEEP_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700405 raise error.TestFail(
406 'client failed to sleep after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700407 self.SLEEP_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700408
409
410 def test_wait_for_resume(self, old_boot_id):
411 """Wait for the client to resume from low-power sleep mode.
412
413 The `old_boot_id` parameter should be the value from
414 `get_boot_id()` obtained prior to entering sleep mode. A
415 `TestFail` exception is raised if the boot id changes.
416
417 See @ref test_wait_for_sleep for more on this function's
418 usage.
419
420 @param[in] old_boot_id A boot id value obtained before the
421 target host went to sleep.
422
423 @exception TestFail The host did not respond within the
424 allowed time.
425 @exception TestFail The host responded, but the boot id test
426 indicated a reboot rather than a sleep
427 cycle.
428 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700429 if not self.wait_up(timeout=self.RESUME_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700430 raise error.TestFail(
431 'client failed to resume from sleep after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700432 self.RESUME_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700433 else:
434 new_boot_id = self.get_boot_id()
435 if new_boot_id != old_boot_id:
436 raise error.TestFail(
437 'client rebooted, but sleep was expected'
438 ' (old boot %s, new boot %s)'
439 % (old_boot_id, new_boot_id))
440
441
442 def test_wait_for_shutdown(self):
443 """Wait for the client to shut down.
444
445 The test for "has shut down" can't distinguish a system that
446 is merely asleep; to confirm that the unit was down, it is
447 necessary to force boot, and then call test_wait_for_boot().
448
449 This function is expected to be called from a test as part
450 of a sequence like the following:
451
452 ~~~~~~~~
453 boot_id = host.get_boot_id()
454 # trigger shutdown on the host
455 host.test_wait_for_shutdown()
456 # trigger boot on the host
457 host.test_wait_for_boot(boot_id)
458 ~~~~~~~~
459
460 @exception TestFail The host did not shut down within the
461 allowed time.
462 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700463 if not self._ping_wait_down(timeout=self.SHUTDOWN_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700464 raise error.TestFail(
465 'client failed to shut down after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700466 self.SHUTDOWN_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700467
468
469 def test_wait_for_boot(self, old_boot_id=None):
470 """Wait for the client to boot from cold power.
471
472 The `old_boot_id` parameter should be the value from
473 `get_boot_id()` obtained prior to shutting down. A
474 `TestFail` exception is raised if the boot id does not
475 change. The boot id test is omitted if `old_boot_id` is not
476 specified.
477
478 See @ref test_wait_for_shutdown for more on this function's
479 usage.
480
481 @param[in] old_boot_id A boot id value obtained before the
482 shut down.
483
484 @exception TestFail The host did not respond within the
485 allowed time.
486 @exception TestFail The host responded, but the boot id test
487 indicated that there was no reboot.
488 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700489 if not self.wait_up(timeout=self.REBOOT_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700490 raise error.TestFail(
491 'client failed to reboot after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700492 self.REBOOT_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700493 elif old_boot_id:
494 if self.get_boot_id() == old_boot_id:
495 raise error.TestFail(
496 'client is back up, but did not reboot'
497 ' (boot %s)' % old_boot_id)