blob: 4feda2f9d9dd169d1c93e0282713d25fc2b6fc11 [file] [log] [blame]
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
import logging
import shlex
import subprocess
import time
import xmlrpclib

from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import global_config, error
from autotest_lib.client.common_lib.cros import autoupdater
from autotest_lib.server import autoserv_parser
from autotest_lib.server import site_host_attributes
from autotest_lib.server import site_remote_power
from autotest_lib.server.cros import servo
from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070018
19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Override default make_ssh_command to use options tuned for Chrome OS.

    Tuning changes:
      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
      connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; which causes SSH to ping connection every
      180 seconds. In conjunction with ServerAliveCountMax ensures that if
      the connection dies, Autotest will bail out quickly. Originally tried
      60 secs, but saw frequent job ABORTS where the test completed
      successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
      consistency with remote_access.sh.

      - UserKnownHostsFile=/dev/null; we don't care about the keys. Host
      keys change with every new installation, don't waste memory/space
      saving them.

      - SSH protocol forced to 2; needed for ServerAliveInterval.

    The `hosts_file`, `connect_timeout` and `alive_interval` parameters
    are accepted so the signature stays call-compatible, but they are not
    used: the fixed values listed above are always emitted.

    @param user Login name passed to `ssh -l`.
    @param port TCP port passed to `ssh -p`; formatted with `%d`.
    @param opts Extra command line options, inserted verbatim right
                after `-a -x`.
    @param hosts_file Ignored.
    @param connect_timeout Ignored.
    @param alive_interval Ignored.

    @return The ssh command line as a single string, without a target
            host name.
    """
    base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
                    ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
                    ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
                    ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
                    ' -o Protocol=2 -l %s -p %d')
    return base_command % (opts, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070049
50
class SiteHost(remote.RemoteHost):
    """Chromium OS specific subclass of Host."""

    _parser = autoserv_parser.autoserv_parser

    # Time to wait for new kernel to be marked successful.
    _KERNEL_UPDATE_TIMEOUT = 120

    # Ephemeral file to indicate that an update has just occurred.
    _JUST_UPDATED_FLAG = '/tmp/just_updated'

    # Timeout values used in test_wait_for_sleep(), et al.
    #
    # _RESUME_TIMEOUT has to be big enough to allow time for WiFi
    # reconnection.
    #
    # _REBOOT_TIMEOUT has to be big enough to allow time for the 30
    # second dev-mode screen delay _and_ time for network startup,
    # which takes several seconds longer than boot.
    #
    # TODO(jrbarnette):  None of these times have been thoroughly
    # tested empirically; if timeouts are a problem, increasing the
    # time limit really might be the right answer.
    _SLEEP_TIMEOUT = 2
    _RESUME_TIMEOUT = 5
    _SHUTDOWN_TIMEOUT = 5
    _REBOOT_TIMEOUT = 45


    def _initialize(self, hostname, servo_host=None, servo_port=None,
                    *args, **dargs):
        """Initialize superclasses, and |self.servo|.

        For creating the host servo object, there are three
        possibilities:  First, if the host is a lab system known to
        have a servo board, we connect to that servo unconditionally.
        Second, if we're called from a control file that requires
        servo features for testing, it will pass settings for
        `servo_host`, `servo_port`, or both.  If neither of these
        cases apply, `self.servo` will be `None`.

        @param hostname Name of the host; forwarded to the superclass
                        and used for the lab servo lookup.
        @param servo_host Optional servo host setting.
        @param servo_port Optional servo port setting.
        """
        super(SiteHost, self)._initialize(hostname=hostname,
                                          *args, **dargs)
        self._xmlrpc_proxy_map = {}
        self.servo = servo.Servo.get_lab_servo(hostname)
        if self.servo:
            return

        # The Servo constructor generally doesn't accept 'None' for
        # its parameters, so pass along only the settings that were
        # actually supplied.
        servo_args = {}
        if servo_host is not None:
            servo_args['servo_host'] = servo_host
        if servo_port is not None:
            servo_args['servo_port'] = servo_port
        if servo_args:
            self.servo = servo.Servo(**servo_args)


    def machine_install(self, update_url=None, force_update=False,
                        local_devserver=False):
        """Update the host to a new image and verify that the update took.

        @param update_url URL of the update to install.  When not
                          given, the `image` option of the autoserv
                          parser is used instead.
        @param force_update Passed through to the updater's
                            `run_update()`.
        @param local_devserver Passed through to the
                               `ChromiumOSUpdater` constructor.

        @exception ChromiumOSError No update URL is available, or one
                                   of the post-update checks failed.
        """
        if not update_url:
            update_url = self._parser.options.image
            if not update_url:
                raise autoupdater.ChromiumOSError(
                    'Update failed. No update URL provided.')

        # Attempt to update the system.
        updater = autoupdater.ChromiumOSUpdater(
            update_url, host=self, local_devserver=local_devserver)
        if updater.run_update(force_update):
            # Figure out active and inactive kernel.
            active_kernel, inactive_kernel = updater.get_kernel_state()

            # The inactive kernel must not have a lower priority than
            # the active one, or the next boot won't select it.
            if (updater.get_kernel_priority(inactive_kernel)
                    < updater.get_kernel_priority(active_kernel)):
                raise autoupdater.ChromiumOSError(
                    'Update failed. The priority of the inactive kernel'
                    ' partition is less than that of the active kernel'
                    ' partition.')

            # Updater has returned, successfully, reboot the host.
            self.reboot(timeout=60, wait=True)

            # Following the reboot, verify the correct version.
            updater.check_version()

            # Figure out newly active kernel.
            new_active_kernel, _ = updater.get_kernel_state()

            # Ensure that previously inactive kernel is now the active
            # kernel.
            if new_active_kernel != inactive_kernel:
                raise autoupdater.ChromiumOSError(
                    'Update failed. New kernel partition is not active after'
                    ' boot.')

            host_attributes = site_host_attributes.HostAttributes(
                self.hostname)
            if host_attributes.has_chromeos_firmware:
                # Wait until tries == 0 and success, or until timeout.
                utils.poll_for_condition(
                    lambda: (updater.get_kernel_tries(new_active_kernel) == 0
                             and updater.get_kernel_success(
                                 new_active_kernel)),
                    exception=autoupdater.ChromiumOSError(
                        'Update failed. Timed out waiting for system to mark'
                        ' new kernel as successful.'),
                    timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)

            # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
            # platform_Shutdown pass correctly.
            if updater.update_version.startswith('0.12'):
                self.reboot(timeout=60, wait=True)

            # Mark host as recently updated. Hosts are rebooted at the end of
            # every test cycle which will remove the file.
            self.run('touch %s' % self._JUST_UPDATED_FLAG)

        # Clean up any old autotest directories which may be lying around.
        for path in global_config.global_config.get_config_value(
                'AUTOSERV', 'client_autodir_paths', type=list):
            self.run('rm -rf ' + path)


    def has_just_updated(self):
        """Indicates whether the host was updated within this boot.

        @return True if the just-updated flag file exists on the host.
        """
        # Let the remote shell report on the flag file, so that a
        # missing file yields 'F' rather than a failed command.
        flag_check = ('[ -f %s ] && echo T || echo F'
                      % self._JUST_UPDATED_FLAG)
        return self.run(flag_check).stdout.strip() == 'T'


    def close(self):
        """Close the host, then drop every XMLRPC connection to it."""
        super(SiteHost, self).close()
        self.xmlrpc_disconnect_all()


    def cleanup(self):
        """Special cleanup method to make sure hosts always get power back."""
        super(SiteHost, self).cleanup()
        remote_power = site_remote_power.RemotePower(self.hostname)
        if remote_power:
            remote_power.set_power_on()


    def verify_software(self):
        """Ensure the stateful partition has space for Autotest and updates.

        Similar to what is done by AbstractSSH, except instead of checking the
        Autotest installation path, just check the stateful partition.

        Checking the stateful partition is preferable in case it has been
        wiped, resulting in an Autotest installation path which doesn't exist
        and isn't writable. We still want to pass verify in this state since
        the partition will be recovered with the next install.
        """
        super(SiteHost, self).verify_software()
        required_gb = global_config.global_config.get_config_value(
            'SERVER', 'gb_diskspace_required', type=int, default=20)
        self.check_diskspace('/mnt/stateful_partition', required_gb)


    def xmlrpc_connect(self, command, port, cleanup=None):
        """Connect to an XMLRPC server on the host.

        The `command` argument should be a simple shell command that
        starts an XMLRPC server on the given `port`.  The command
        must not daemonize, and must terminate cleanly on SIGTERM.
        The command is started in the background on the host, and a
        local XMLRPC client for the server is created and returned
        to the caller.

        Note that the process of creating an XMLRPC client makes no
        attempt to connect to the remote server; the caller is
        responsible for determining whether the server is running
        correctly, and is ready to serve requests.

        Any connection previously made for the same `port` is
        disconnected first.

        @param command Shell command to start the server.
        @param port Port number on which the server is expected to
                    be serving.
        @param cleanup Optional pattern identifying the server
                       process; when given, `xmlrpc_disconnect()`
                       hands it to `pkill -f` on the host to stop
                       the server.

        @return An `xmlrpclib.ServerProxy` that talks to the local
                end of the ssh tunnel.
        """
        self.xmlrpc_disconnect(port)

        # Chrome OS on the target closes down most external ports
        # for security.  We could open the port, but doing that
        # would conflict with security tests that check that only
        # expected ports are open.  So, to get to the port on the
        # target we use an ssh tunnel.
        local_port = utils.get_unused_port()
        tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
        ssh_cmd = make_ssh_command(opts=tunnel_options)
        tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
        logging.debug('Full tunnel command: %s', tunnel_cmd)
        # Exec ssh directly rather than through a shell.  With a
        # shell in between, terminate() would signal the shell and
        # could leave the ssh tunnel itself running.
        tunnel_proc = subprocess.Popen(shlex.split(tunnel_cmd),
                                       close_fds=True)
        logging.debug('Started XMLRPC tunnel, local = %d'
                      ' remote = %d, pid = %d',
                      local_port, port, tunnel_proc.pid)

        # Start the server on the host.  Redirection in the command
        # below is necessary, because 'ssh' won't terminate until
        # background child processes close stdin, stdout, and
        # stderr.
        remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
        try:
            remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
        except Exception:
            # The tunnel isn't recorded in the proxy map yet, so
            # nothing else would ever shut it down.
            if tunnel_proc.poll() is None:
                tunnel_proc.terminate()
            raise
        logging.debug('Started XMLRPC server on host %s, pid = %s',
                      self.hostname, remote_pid)

        self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
        rpc_url = 'http://localhost:%d' % local_port
        return xmlrpclib.ServerProxy(rpc_url, allow_none=True)


    def xmlrpc_disconnect(self, port):
        """Disconnect from an XMLRPC server on the host.

        Terminates the remote XMLRPC server previously started for
        the given `port`.  Also closes the local ssh tunnel created
        for the connection to the host.  This function does not
        directly alter the state of a previously returned XMLRPC
        client object; however disconnection will cause all
        subsequent calls to methods on the object to fail.

        This function does nothing if requested to disconnect a port
        that was not previously connected via `self.xmlrpc_connect()`

        The local tunnel is shut down and the port is forgotten even
        if the attempt to stop the remote server raises.

        @param port Port number passed to a previous call to
                    `xmlrpc_connect()`
        """
        entry = self._xmlrpc_proxy_map.pop(port, None)
        if entry is None:
            return
        remote_name, tunnel_proc = entry
        try:
            if remote_name:
                # We use 'pkill' to find our target process rather than
                # a PID, because the host may have rebooted since
                # connecting, and we don't want to kill an innocent
                # process with the same PID.
                #
                # 'pkill' helpfully exits with status 1 if no target
                # process is found, for which run() will throw an
                # exception.  We don't want that, so we ignore the
                # status.
                self.run("pkill -f '%s'" % remote_name, ignore_status=True)
        finally:
            if tunnel_proc.poll() is None:
                tunnel_proc.terminate()
                logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
            else:
                logging.debug('Tunnel pid %d terminated early, status %d',
                              tunnel_proc.pid, tunnel_proc.returncode)


    def xmlrpc_disconnect_all(self):
        """Disconnect all known XMLRPC proxy ports."""
        # Iterate over a snapshot: xmlrpc_disconnect() removes
        # entries from the map as it goes.
        for port in list(self._xmlrpc_proxy_map):
            self.xmlrpc_disconnect(port)


    def _ping_is_up(self):
        """Ping the host once, and return whether it responded."""
        return utils.ping(self.hostname, tries=1, deadline=1) == 0


    def _ping_wait_down(self, timeout):
        """Wait until the host no longer responds to `ping`.

        @param timeout Minimum time to allow before declaring the
                       host to be non-responsive.

        @return True if a ping went unanswered; False if the host
                was still answering at the final check.
        """

        # This function is a slightly faster version of wait_down().
        #
        # In AbstractSSHHost.wait_down(), `ssh` is used to determine
        # whether the host is down.  In some situations (mine, at
        # least), `ssh` can take over a minute to determine that the
        # host is down.  The `ping` command answers the question
        # faster, so we use that here instead.
        #
        # There is no equivalent for wait_up(), because a target that
        # answers to `ping` won't necessarily respond to `ssh`.
        end_time = time.time() + timeout
        while time.time() <= end_time:
            if not self._ping_is_up():
                return True

        # If the timeout is short relative to the run time of
        # _ping_is_up(), we might be prone to false failures for
        # lack of checking frequently enough.  To be safe, we make
        # one last check _after_ the deadline.
        return not self._ping_is_up()


    def test_wait_for_sleep(self):
        """Wait for the client to enter low-power sleep mode.

        The test for "is asleep" can't distinguish a system that is
        powered off; to confirm that the unit was asleep, it is
        necessary to force resume, and then call
        `test_wait_for_resume()`.

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger sleep on the host
            host.test_wait_for_sleep()
            # trigger resume on the host
            host.test_wait_for_resume(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not go to sleep within
                            the allowed time.
        """
        if not self._ping_wait_down(timeout=self._SLEEP_TIMEOUT):
            raise error.TestFail(
                'client failed to sleep after %d seconds' %
                self._SLEEP_TIMEOUT)


    def test_wait_for_resume(self, old_boot_id):
        """Wait for the client to resume from low-power sleep mode.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to entering sleep mode.  A
        `TestFail` exception is raised if the boot id changes.

        See @ref test_wait_for_sleep for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               target host went to sleep.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated a reboot rather than a sleep
                            cycle.
        """
        if not self.wait_up(timeout=self._RESUME_TIMEOUT):
            raise error.TestFail(
                'client failed to resume from sleep after %d seconds' %
                self._RESUME_TIMEOUT)

        new_boot_id = self.get_boot_id()
        if new_boot_id != old_boot_id:
            raise error.TestFail(
                'client rebooted, but sleep was expected'
                ' (old boot %s, new boot %s)'
                % (old_boot_id, new_boot_id))


    def test_wait_for_shutdown(self):
        """Wait for the client to shut down.

        The test for "has shut down" can't distinguish a system that
        is merely asleep; to confirm that the unit was down, it is
        necessary to force boot, and then call test_wait_for_boot().

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger shutdown on the host
            host.test_wait_for_shutdown()
            # trigger boot on the host
            host.test_wait_for_boot(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not shut down within the
                            allowed time.
        """
        if not self._ping_wait_down(timeout=self._SHUTDOWN_TIMEOUT):
            raise error.TestFail(
                'client failed to shut down after %d seconds' %
                self._SHUTDOWN_TIMEOUT)


    def test_wait_for_boot(self, old_boot_id=None):
        """Wait for the client to boot from cold power.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to shutting down.  A
        `TestFail` exception is raised if the boot id does not
        change.  The boot id test is omitted if `old_boot_id` is not
        specified.

        See @ref test_wait_for_shutdown for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               shut down.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated that there was no reboot.
        """
        if not self.wait_up(timeout=self._REBOOT_TIMEOUT):
            raise error.TestFail(
                'client failed to reboot after %d seconds' %
                self._REBOOT_TIMEOUT)

        if old_boot_id and self.get_boot_id() == old_boot_id:
            raise error.TestFail(
                'client is back up, but did not reboot'
                ' (boot %s)' % old_boot_id)