blob: 2a0bac5aa7ff6a5f1dde3414a0b1770f592a2f4a [file] [log] [blame]
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
J. Richard Barnette1d78b012012-05-15 13:56:30 -07005import logging
6import subprocess
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07007import time
J. Richard Barnette1d78b012012-05-15 13:56:30 -07008import xmlrpclib
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07009
J. Richard Barnette45e93de2012-04-11 17:24:15 -070010from autotest_lib.client.bin import utils
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070011from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette45e93de2012-04-11 17:24:15 -070012from autotest_lib.client.common_lib.cros import autoupdater
13from autotest_lib.server import autoserv_parser
14from autotest_lib.server import site_host_attributes
15from autotest_lib.server import site_remote_power
J. Richard Barnette67ccb872012-04-19 16:34:56 -070016from autotest_lib.server.cros import servo
J. Richard Barnette45e93de2012-04-11 17:24:15 -070017from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070018
19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the ssh command line used to talk to a Chrome OS host.

    Overrides the default make_ssh_command with a fixed set of options
    tuned for Chrome OS.  `hosts_file`, `connect_timeout` and
    `alive_interval` are accepted for signature compatibility only; this
    function never reads them, and the values below are always used.

    Fixed options:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
        connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; which causes SSH to ping the connection
        every 180 seconds.  In conjunction with ServerAliveCountMax this
        ensures that if the connection dies, Autotest will bail out
        quickly.  Originally tried 60 secs, but saw frequent job ABORTS
        where the test completed successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
        consistency with remote_access.sh.

      - UserKnownHostsFile=/dev/null; we don't care about the keys.  Host
        keys change with every new installation, don't waste memory/space
        saving them.

      - SSH protocol forced to 2; needed for ServerAliveInterval.

    @param user Login name, passed to `ssh -l`.
    @param port Port number, passed to `ssh -p` (formatted with %d).
    @param opts Extra option text inserted right after `-a -x`.
    @param hosts_file Ignored.
    @param connect_timeout Ignored.
    @param alive_interval Ignored.

    @return The ssh command as one string; no host name is appended.
    """
    fixed_settings = (
        'StrictHostKeyChecking=no',
        'UserKnownHostsFile=/dev/null',
        'BatchMode=yes',
        'ConnectTimeout=30',
        'ServerAliveInterval=180',
        'ServerAliveCountMax=3',
        'ConnectionAttempts=4',
        'Protocol=2',
    )
    setting_flags = ' '.join('-o ' + setting for setting in fixed_settings)
    template = '/usr/bin/ssh -a -x %s ' + setting_flags + ' -l %s -p %d'
    return template % (opts, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070049
50
class SiteHost(remote.RemoteHost):
    """Chromium OS specific subclass of Host."""

    # The autoserv command-line parser object; machine_install() reads
    # the default image URL from its `options.image`.
    _parser = autoserv_parser.autoserv_parser

    # Time to wait for new kernel to be marked successful.
    _KERNEL_UPDATE_TIMEOUT = 120

    # Ephemeral file to indicate that an update has just occurred.
    _JUST_UPDATED_FLAG = '/tmp/just_updated'

    # Timeout values associated with various Chrome OS state
    # changes.  In general, the timeouts are the maximum time to
    # allow between some event X, and the time that the unit is
    # on (or off) the network.  Note that "time to get on the
    # network" is typically longer than the time to complete the
    # operation.  The test_wait_for_*() failure messages report
    # these values as seconds.
    #
    # TODO(jrbarnette):  None of these times have been thoroughly
    # tested empirically; if timeouts are a problem, increasing the
    # time limit really might be the right answer.
    #
    # SLEEP_TIMEOUT:  Time to allow for suspend to memory.
    # RESUME_TIMEOUT:  Time to allow for resume after suspend.
    # BOOT_TIMEOUT:  Time to allow for boot from power off.  Among
    #     other things, this includes time for the 30 second dev-mode
    #     screen delay.
    # USB_BOOT_TIMEOUT:  Time to allow for boot from a USB device,
    #     including the 30 second dev-mode delay.
    # SHUTDOWN_TIMEOUT:  Time to allow to shut down.
    # REBOOT_TIMEOUT:  Combination of shutdown and reboot times
    #     (computed as SHUTDOWN_TIMEOUT + BOOT_TIMEOUT).

    SLEEP_TIMEOUT = 2
    RESUME_TIMEOUT = 5
    BOOT_TIMEOUT = 45
    USB_BOOT_TIMEOUT = 150
    SHUTDOWN_TIMEOUT = 5
    REBOOT_TIMEOUT = SHUTDOWN_TIMEOUT + BOOT_TIMEOUT
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070089
90
J. Richard Barnette55fb8062012-05-23 10:29:31 -070091 def _initialize(self, hostname, servo_host=None, servo_port=None,
92 *args, **dargs):
J. Richard Barnette67ccb872012-04-19 16:34:56 -070093 """Initialize superclasses, and |self.servo|.
94
95 For creating the host servo object, there are three
96 possibilities: First, if the host is a lab system known to
97 have a servo board, we connect to that servo unconditionally.
98 Second, if we're called from a control file that requires
J. Richard Barnette55fb8062012-05-23 10:29:31 -070099 servo features for testing, it will pass settings for
100 `servo_host`, `servo_port`, or both. If neither of these
101 cases apply, `self.servo` will be `None`.
J. Richard Barnette67ccb872012-04-19 16:34:56 -0700102
103 """
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700104 super(SiteHost, self)._initialize(hostname=hostname,
105 *args, **dargs)
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700106 self._xmlrpc_proxy_map = {}
J. Richard Barnette67ccb872012-04-19 16:34:56 -0700107 self.servo = servo.Servo.get_lab_servo(hostname)
J. Richard Barnette55fb8062012-05-23 10:29:31 -0700108 if not self.servo:
109 # The Servo constructor generally doesn't accept 'None'
110 # for its parameters.
111 if servo_host is not None:
112 if servo_port is not None:
113 self.servo = servo.Servo(servo_host=servo_host,
114 servo_port=servo_port)
115 else:
116 self.servo = servo.Servo(servo_host=servo_host)
117 elif servo_port is not None:
118 self.servo = servo.Servo(servo_port=servo_port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700119
120
Chris Sosaa3ac2152012-05-23 22:23:13 -0700121 def machine_install(self, update_url=None, force_update=False,
122 local_devserver=False):
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700123 if not update_url and self._parser.options.image:
124 update_url = self._parser.options.image
125 elif not update_url:
126 raise autoupdater.ChromiumOSError(
127 'Update failed. No update URL provided.')
128
129 # Attempt to update the system.
Chris Sosaa3ac2152012-05-23 22:23:13 -0700130 updater = autoupdater.ChromiumOSUpdater(update_url, host=self,
131 local_devserver=local_devserver)
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700132 if updater.run_update(force_update):
133 # Figure out active and inactive kernel.
134 active_kernel, inactive_kernel = updater.get_kernel_state()
135
136 # Ensure inactive kernel has higher priority than active.
137 if (updater.get_kernel_priority(inactive_kernel)
138 < updater.get_kernel_priority(active_kernel)):
139 raise autoupdater.ChromiumOSError(
140 'Update failed. The priority of the inactive kernel'
141 ' partition is less than that of the active kernel'
142 ' partition.')
143
144 # Updater has returned, successfully, reboot the host.
145 self.reboot(timeout=60, wait=True)
146
147 # Following the reboot, verify the correct version.
148 updater.check_version()
149
150 # Figure out newly active kernel.
151 new_active_kernel, _ = updater.get_kernel_state()
152
153 # Ensure that previously inactive kernel is now the active kernel.
154 if new_active_kernel != inactive_kernel:
155 raise autoupdater.ChromiumOSError(
156 'Update failed. New kernel partition is not active after'
157 ' boot.')
158
159 host_attributes = site_host_attributes.HostAttributes(self.hostname)
160 if host_attributes.has_chromeos_firmware:
161 # Wait until tries == 0 and success, or until timeout.
162 utils.poll_for_condition(
163 lambda: (updater.get_kernel_tries(new_active_kernel) == 0
164 and updater.get_kernel_success(new_active_kernel)),
165 exception=autoupdater.ChromiumOSError(
166 'Update failed. Timed out waiting for system to mark'
167 ' new kernel as successful.'),
168 timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
169
170 # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
171 # platform_Shutdown pass correctly.
172 if updater.update_version.startswith('0.12'):
173 self.reboot(timeout=60, wait=True)
174
175 # Mark host as recently updated. Hosts are rebooted at the end of
176 # every test cycle which will remove the file.
177 self.run('touch %s' % self._JUST_UPDATED_FLAG)
178
179 # Clean up any old autotest directories which may be lying around.
180 for path in global_config.global_config.get_config_value(
181 'AUTOSERV', 'client_autodir_paths', type=list):
182 self.run('rm -rf ' + path)
183
184
185 def has_just_updated(self):
186 """Indicates whether the host was updated within this boot."""
187 # Check for the existence of the just updated flag file.
188 return self.run(
189 '[ -f %s ] && echo T || echo F'
190 % self._JUST_UPDATED_FLAG).stdout.strip() == 'T'
191
192
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700193 def close(self):
194 super(SiteHost, self).close()
195 self.xmlrpc_disconnect_all()
196
197
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700198 def cleanup(self):
199 """Special cleanup method to make sure hosts always get power back."""
200 super(SiteHost, self).cleanup()
201 remote_power = site_remote_power.RemotePower(self.hostname)
202 if remote_power:
203 remote_power.set_power_on()
204
205
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700206 def reboot(self, **dargs):
207 """
208 This function reboots the site host. The more generic
209 RemoteHost.reboot() performs sync and sleeps for 5
210 seconds. This is not necessary for Chrome OS devices as the
211 sync should be finished in a short time during the reboot
212 command.
213 """
Tom Wai-Hong Tamf5cd1d42012-08-13 12:04:08 +0800214 if 'reboot_cmd' not in dargs:
215 dargs['reboot_cmd'] = ('((reboot & sleep 10; reboot -f &)'
216 ' </dev/null >/dev/null 2>&1 &)')
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700217 # Enable fastsync to avoid running extra sync commands before reboot.
Tom Wai-Hong Tamf5cd1d42012-08-13 12:04:08 +0800218 if 'fastsync' not in dargs:
219 dargs['fastsync'] = True
Yu-Ju Honga2be94a2012-07-31 09:48:52 -0700220 super(SiteHost, self).reboot(**dargs)
221
222
J. Richard Barnette45e93de2012-04-11 17:24:15 -0700223 def verify_software(self):
224 """Ensure the stateful partition has space for Autotest and updates.
225
226 Similar to what is done by AbstractSSH, except instead of checking the
227 Autotest installation path, just check the stateful partition.
228
229 Checking the stateful partition is preferable in case it has been wiped,
230 resulting in an Autotest installation path which doesn't exist and isn't
231 writable. We still want to pass verify in this state since the partition
232 will be recovered with the next install.
233 """
234 super(SiteHost, self).verify_software()
235 self.check_diskspace(
236 '/mnt/stateful_partition',
237 global_config.global_config.get_config_value(
238 'SERVER', 'gb_diskspace_required', type=int,
239 default=20))
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700240
241
J. Richard Barnette1d78b012012-05-15 13:56:30 -0700242 def xmlrpc_connect(self, command, port, cleanup=None):
243 """Connect to an XMLRPC server on the host.
244
245 The `command` argument should be a simple shell command that
246 starts an XMLRPC server on the given `port`. The command
247 must not daemonize, and must terminate cleanly on SIGTERM.
248 The command is started in the background on the host, and a
249 local XMLRPC client for the server is created and returned
250 to the caller.
251
252 Note that the process of creating an XMLRPC client makes no
253 attempt to connect to the remote server; the caller is
254 responsible for determining whether the server is running
255 correctly, and is ready to serve requests.
256
257 @param command Shell command to start the server.
258 @param port Port number on which the server is expected to
259 be serving.
260 """
261 self.xmlrpc_disconnect(port)
262
263 # Chrome OS on the target closes down most external ports
264 # for security. We could open the port, but doing that
265 # would conflict with security tests that check that only
266 # expected ports are open. So, to get to the port on the
267 # target we use an ssh tunnel.
268 local_port = utils.get_unused_port()
269 tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
270 ssh_cmd = make_ssh_command(opts=tunnel_options)
271 tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
272 logging.debug('Full tunnel command: %s', tunnel_cmd)
273 tunnel_proc = subprocess.Popen(tunnel_cmd, shell=True, close_fds=True)
274 logging.debug('Started XMLRPC tunnel, local = %d'
275 ' remote = %d, pid = %d',
276 local_port, port, tunnel_proc.pid)
277
278 # Start the server on the host. Redirection in the command
279 # below is necessary, because 'ssh' won't terminate until
280 # background child processes close stdin, stdout, and
281 # stderr.
282 remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
283 remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
284 logging.debug('Started XMLRPC server on host %s, pid = %s',
285 self.hostname, remote_pid)
286
287 self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
288 rpc_url = 'http://localhost:%d' % local_port
289 return xmlrpclib.ServerProxy(rpc_url, allow_none=True)
290
291
292 def xmlrpc_disconnect(self, port):
293 """Disconnect from an XMLRPC server on the host.
294
295 Terminates the remote XMLRPC server previously started for
296 the given `port`. Also closes the local ssh tunnel created
297 for the connection to the host. This function does not
298 directly alter the state of a previously returned XMLRPC
299 client object; however disconnection will cause all
300 subsequent calls to methods on the object to fail.
301
302 This function does nothing if requested to disconnect a port
303 that was not previously connected via `self.xmlrpc_connect()`
304
305 @param port Port number passed to a previous call to
306 `xmlrpc_connect()`
307 """
308 if port not in self._xmlrpc_proxy_map:
309 return
310 entry = self._xmlrpc_proxy_map[port]
311 remote_name = entry[0]
312 tunnel_proc = entry[1]
313 if remote_name:
314 # We use 'pkill' to find our target process rather than
315 # a PID, because the host may have rebooted since
316 # connecting, and we don't want to kill an innocent
317 # process with the same PID.
318 #
319 # 'pkill' helpfully exits with status 1 if no target
320 # process is found, for which run() will throw an
321 # exception. We don't want that, so we ignore the
322 # status.
323 self.run("pkill -f '%s'" % remote_name, ignore_status=True)
324
325 if tunnel_proc.poll() is None:
326 tunnel_proc.terminate()
327 logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
328 else:
329 logging.debug('Tunnel pid %d terminated early, status %d',
330 tunnel_proc.pid, tunnel_proc.returncode)
331 del self._xmlrpc_proxy_map[port]
332
333
334 def xmlrpc_disconnect_all(self):
335 """Disconnect all known XMLRPC proxy ports."""
336 for port in self._xmlrpc_proxy_map.keys():
337 self.xmlrpc_disconnect(port)
338
339
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700340 def _ping_is_up(self):
341 """Ping the host once, and return whether it responded."""
342 return utils.ping(self.hostname, tries=1, deadline=1) == 0
343
344
345 def _ping_wait_down(self, timeout):
346 """Wait until the host no longer responds to `ping`.
347
348 @param timeout Minimum time to allow before declaring the
349 host to be non-responsive.
350 """
351
352 # This function is a slightly faster version of wait_down().
353 #
354 # In AbstractSSHHost.wait_down(), `ssh` is used to determine
355 # whether the host is down. In some situations (mine, at
356 # least), `ssh` can take over a minute to determine that the
357 # host is down. The `ping` command answers the question
358 # faster, so we use that here instead.
359 #
360 # There is no equivalent for wait_up(), because a target that
361 # answers to `ping` won't necessarily respond to `ssh`.
362 end_time = time.time() + timeout
363 while time.time() <= end_time:
364 if not self._ping_is_up():
365 return True
366
367 # If the timeout is short relative to the run time of
368 # _ping_is_up(), we might be prone to false failures for
369 # lack of checking frequently enough. To be safe, we make
370 # one last check _after_ the deadline.
371 return not self._ping_is_up()
372
373
374 def test_wait_for_sleep(self):
375 """Wait for the client to enter low-power sleep mode.
376
377 The test for "is asleep" can't distinguish a system that is
378 powered off; to confirm that the unit was asleep, it is
379 necessary to force resume, and then call
380 `test_wait_for_resume()`.
381
382 This function is expected to be called from a test as part
383 of a sequence like the following:
384
385 ~~~~~~~~
386 boot_id = host.get_boot_id()
387 # trigger sleep on the host
388 host.test_wait_for_sleep()
389 # trigger resume on the host
390 host.test_wait_for_resume(boot_id)
391 ~~~~~~~~
392
393 @exception TestFail The host did not go to sleep within
394 the allowed time.
395 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700396 if not self._ping_wait_down(timeout=self.SLEEP_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700397 raise error.TestFail(
398 'client failed to sleep after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700399 self.SLEEP_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700400
401
402 def test_wait_for_resume(self, old_boot_id):
403 """Wait for the client to resume from low-power sleep mode.
404
405 The `old_boot_id` parameter should be the value from
406 `get_boot_id()` obtained prior to entering sleep mode. A
407 `TestFail` exception is raised if the boot id changes.
408
409 See @ref test_wait_for_sleep for more on this function's
410 usage.
411
412 @param[in] old_boot_id A boot id value obtained before the
413 target host went to sleep.
414
415 @exception TestFail The host did not respond within the
416 allowed time.
417 @exception TestFail The host responded, but the boot id test
418 indicated a reboot rather than a sleep
419 cycle.
420 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700421 if not self.wait_up(timeout=self.RESUME_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700422 raise error.TestFail(
423 'client failed to resume from sleep after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700424 self.RESUME_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700425 else:
426 new_boot_id = self.get_boot_id()
427 if new_boot_id != old_boot_id:
428 raise error.TestFail(
429 'client rebooted, but sleep was expected'
430 ' (old boot %s, new boot %s)'
431 % (old_boot_id, new_boot_id))
432
433
434 def test_wait_for_shutdown(self):
435 """Wait for the client to shut down.
436
437 The test for "has shut down" can't distinguish a system that
438 is merely asleep; to confirm that the unit was down, it is
439 necessary to force boot, and then call test_wait_for_boot().
440
441 This function is expected to be called from a test as part
442 of a sequence like the following:
443
444 ~~~~~~~~
445 boot_id = host.get_boot_id()
446 # trigger shutdown on the host
447 host.test_wait_for_shutdown()
448 # trigger boot on the host
449 host.test_wait_for_boot(boot_id)
450 ~~~~~~~~
451
452 @exception TestFail The host did not shut down within the
453 allowed time.
454 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700455 if not self._ping_wait_down(timeout=self.SHUTDOWN_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700456 raise error.TestFail(
457 'client failed to shut down after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700458 self.SHUTDOWN_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700459
460
461 def test_wait_for_boot(self, old_boot_id=None):
462 """Wait for the client to boot from cold power.
463
464 The `old_boot_id` parameter should be the value from
465 `get_boot_id()` obtained prior to shutting down. A
466 `TestFail` exception is raised if the boot id does not
467 change. The boot id test is omitted if `old_boot_id` is not
468 specified.
469
470 See @ref test_wait_for_shutdown for more on this function's
471 usage.
472
473 @param[in] old_boot_id A boot id value obtained before the
474 shut down.
475
476 @exception TestFail The host did not respond within the
477 allowed time.
478 @exception TestFail The host responded, but the boot id test
479 indicated that there was no reboot.
480 """
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700481 if not self.wait_up(timeout=self.REBOOT_TIMEOUT):
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700482 raise error.TestFail(
483 'client failed to reboot after %d seconds' %
J. Richard Barnetteeb69d722012-06-18 17:29:44 -0700484 self.REBOOT_TIMEOUT)
J. Richard Barnette134ec2c2012-04-25 12:59:37 -0700485 elif old_boot_id:
486 if self.get_boot_id() == old_boot_id:
487 raise error.TestFail(
488 'client is back up, but did not reboot'
489 ' (boot %s)' % old_boot_id)