blob: 0f75d6ba8a3d37a8ec5b6dc211519f208580599a [file] [log] [blame]
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
J. Richard Barnette1d78b012012-05-15 13:56:30 -07005import logging
6import subprocess
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07007import time
J. Richard Barnette1d78b012012-05-15 13:56:30 -07008import xmlrpclib
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07009
J. Richard Barnette45e93de2012-04-11 17:24:15 -070010from autotest_lib.client.bin import utils
J. Richard Barnette134ec2c2012-04-25 12:59:37 -070011from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette45e93de2012-04-11 17:24:15 -070012from autotest_lib.client.common_lib.cros import autoupdater
13from autotest_lib.server import autoserv_parser
14from autotest_lib.server import site_host_attributes
15from autotest_lib.server import site_remote_power
J. Richard Barnette67ccb872012-04-19 16:34:56 -070016from autotest_lib.server.cros import servo
J. Richard Barnette45e93de2012-04-11 17:24:15 -070017from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070018
19
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the ssh command line prefix used to reach Chrome OS hosts.

    This replaces the default make_ssh_command with a fixed set of
    options tuned for Chrome OS.  The `hosts_file`, `connect_timeout`
    and `alive_interval` parameters are accepted so the signature stays
    compatible with the default, but their values are not used; the
    tuned settings below always apply.

    Tuned settings:
      - ConnectTimeout=30; at most 30 seconds are allowed for an SSH
        connection failure.  Consistent with remote_access.sh.

      - ServerAliveInterval=180; SSH pings the connection every 180
        seconds.  Together with ServerAliveCountMax this ensures that
        Autotest bails out quickly when the connection dies.  60 seconds
        was tried originally, but produced frequent job ABORTS on tests
        that had completed successfully.

      - ServerAliveCountMax=3; consistent with remote_access.sh.

      - ConnectionAttempts=4; reduces flakiness from connection errors;
        consistent with remote_access.sh.

      - UserKnownHostsFile=/dev/null; host keys change with every new
        installation, so nothing is gained by saving them.

      - Protocol=2; needed for ServerAliveInterval.

    @param user Login name passed to `ssh -l`.
    @param port Integer port number passed to `ssh -p`.
    @param opts Extra ssh options, inserted verbatim ahead of the tuned
                `-o` options.
    @param hosts_file Ignored.
    @param connect_timeout Ignored.
    @param alive_interval Ignored.

    @return The ssh command string, without the target host name.
    """
    command_template = ' '.join((
        '/usr/bin/ssh -a -x %s',
        '-o StrictHostKeyChecking=no',
        '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes',
        '-o ConnectTimeout=30',
        '-o ServerAliveInterval=180',
        '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4',
        '-o Protocol=2',
        '-l %s -p %d',
    ))
    return command_template % (opts, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070049
50
class SiteHost(remote.RemoteHost):
    """Chromium OS specific subclass of Host.

    Adds image installation through the autoupdater, management of
    XMLRPC servers reached through ssh tunnels, and helpers that tests
    use to wait for sleep/resume and shutdown/boot transitions.
    """

    _parser = autoserv_parser.autoserv_parser

    # Time to wait for new kernel to be marked successful.
    _KERNEL_UPDATE_TIMEOUT = 120

    # Ephemeral file to indicate that an update has just occurred.
    _JUST_UPDATED_FLAG = '/tmp/just_updated'

    # Timeout values used in test_wait_for_sleep(), et al.
    #
    # _RESUME_TIMEOUT has to be big enough to allow time for WiFi
    # reconnection.
    #
    # _REBOOT_TIMEOUT has to be big enough to allow time for the 30
    # second dev-mode screen delay _and_ time for network startup,
    # which takes several seconds longer than boot.
    #
    # TODO(jrbarnette): None of these times have been thoroughly
    # tested empirically; if timeouts are a problem, increasing the
    # time limit really might be the right answer.
    _SLEEP_TIMEOUT = 2
    _RESUME_TIMEOUT = 5
    _SHUTDOWN_TIMEOUT = 5
    _REBOOT_TIMEOUT = 45


    def _initialize(self, hostname, require_servo=False, *args, **dargs):
        """Initialize superclasses, and |self.servo|.

        For creating the host servo object, there are three
        possibilities:  First, if the host is a lab system known to
        have a servo board, we connect to that servo unconditionally.
        Second, if we're called from a control file that requires
        servo features for testing, it will pass |require_servo| set
        to |True|, and we will start a local servod.  If neither of
        these cases apply, |self.servo| will be |None|.

        @param hostname Name of the host; also used to look up a lab
                        servo.
        @param require_servo Whether to create a servo object when no
                             lab servo is found for the host.
        """
        super(SiteHost, self)._initialize(hostname=hostname,
                                          *args, **dargs)
        # Maps remote port -> (cleanup pattern, tunnel process); filled
        # in by xmlrpc_connect() and emptied by xmlrpc_disconnect().
        self._xmlrpc_proxy_map = {}
        self.servo = servo.Servo.get_lab_servo(hostname)
        if not self.servo and require_servo:
            self.servo = servo.Servo()


    def machine_install(self, update_url=None, force_update=False):
        """Update the host to a new image and verify that it booted.

        When the updater reports that an update was applied, the host
        is rebooted and the new version and kernel partition state are
        checked.

        @param update_url URL of the update to apply.  When not given,
                          the `image` option from the autoserv parser
                          is used instead.
        @param force_update Passed through to the updater's
                            `run_update()`.

        @exception ChromiumOSError No update URL is available, or one
                                   of the post-update kernel checks
                                   failed.
        """
        if not update_url and self._parser.options.image:
            update_url = self._parser.options.image
        elif not update_url:
            raise autoupdater.ChromiumOSError(
                'Update failed. No update URL provided.')

        # Attempt to update the system.
        updater = autoupdater.ChromiumOSUpdater(update_url, host=self)
        if updater.run_update(force_update):
            # Figure out active and inactive kernel.
            active_kernel, inactive_kernel = updater.get_kernel_state()

            # Ensure inactive kernel has higher priority than active.
            if (updater.get_kernel_priority(inactive_kernel)
                    < updater.get_kernel_priority(active_kernel)):
                raise autoupdater.ChromiumOSError(
                    'Update failed. The priority of the inactive kernel'
                    ' partition is less than that of the active kernel'
                    ' partition.')

            # Updater has returned, successfully, reboot the host.
            self.reboot(timeout=60, wait=True)

            # Following the reboot, verify the correct version.
            updater.check_version()

            # Figure out newly active kernel.
            new_active_kernel, _ = updater.get_kernel_state()

            # Ensure that previously inactive kernel is now the active kernel.
            if new_active_kernel != inactive_kernel:
                raise autoupdater.ChromiumOSError(
                    'Update failed. New kernel partition is not active after'
                    ' boot.')

            host_attributes = site_host_attributes.HostAttributes(self.hostname)
            if host_attributes.has_chromeos_firmware:
                # Wait until tries == 0 and success, or until timeout.
                utils.poll_for_condition(
                    lambda: (updater.get_kernel_tries(new_active_kernel) == 0
                             and updater.get_kernel_success(new_active_kernel)),
                    exception=autoupdater.ChromiumOSError(
                        'Update failed. Timed out waiting for system to mark'
                        ' new kernel as successful.'),
                    timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)

            # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
            # platform_Shutdown pass correctly.
            if updater.update_version.startswith('0.12'):
                self.reboot(timeout=60, wait=True)

            # Mark host as recently updated. Hosts are rebooted at the end of
            # every test cycle which will remove the file.
            self.run('touch %s' % self._JUST_UPDATED_FLAG)

        # Clean up any old autotest directories which may be lying around.
        # NOTE(review): this loop is taken to run whether or not an update
        # was applied -- confirm the nesting against the upstream file.
        for path in global_config.global_config.get_config_value(
                'AUTOSERV', 'client_autodir_paths', type=list):
            self.run('rm -rf ' + path)


    def has_just_updated(self):
        """Indicates whether the host was updated within this boot.

        @return True if the just-updated flag file exists on the host.
        """
        # Check for the existence of the just updated flag file.
        return self.run(
            '[ -f %s ] && echo T || echo F'
            % self._JUST_UPDATED_FLAG).stdout.strip() == 'T'


    def close(self):
        """Disconnect all XMLRPC proxies, then close the host.

        The XMLRPC teardown runs first because `xmlrpc_disconnect()`
        runs a command on the host via `self.run()`, so it should
        happen before the superclass tears anything down.  The
        superclass `close()` is called even if the teardown raises.
        """
        try:
            self.xmlrpc_disconnect_all()
        finally:
            super(SiteHost, self).close()


    def cleanup(self):
        """Special cleanup method to make sure hosts always get power back."""
        super(SiteHost, self).cleanup()
        remote_power = site_remote_power.RemotePower(self.hostname)
        if remote_power:
            remote_power.set_power_on()


    def verify_software(self):
        """Ensure the stateful partition has space for Autotest and updates.

        Similar to what is done by AbstractSSH, except instead of checking the
        Autotest installation path, just check the stateful partition.

        Checking the stateful partition is preferable in case it has been wiped,
        resulting in an Autotest installation path which doesn't exist and isn't
        writable. We still want to pass verify in this state since the partition
        will be recovered with the next install.
        """
        super(SiteHost, self).verify_software()
        self.check_diskspace(
            '/mnt/stateful_partition',
            global_config.global_config.get_config_value(
                'SERVER', 'gb_diskspace_required', type=int,
                default=20))


    def xmlrpc_connect(self, command, port, cleanup=None):
        """Connect to an XMLRPC server on the host.

        The `command` argument should be a simple shell command that
        starts an XMLRPC server on the given `port`.  The command
        must not daemonize, and must terminate cleanly on SIGTERM.
        The command is started in the background on the host, and a
        local XMLRPC client for the server is created and returned
        to the caller.

        Note that the process of creating an XMLRPC client makes no
        attempt to connect to the remote server; the caller is
        responsible for determining whether the server is running
        correctly, and is ready to serve requests.

        Any connection previously made for the same `port` is
        disconnected first.

        @param command Shell command to start the server.
        @param port Port number on which the server is expected to
                    be serving.
        @param cleanup If not None, a pattern handed to `pkill -f` on
                       the host by `xmlrpc_disconnect()` in order to
                       stop the server.

        @return An `xmlrpclib.ServerProxy` that talks to the server
                through a local ssh tunnel.
        """
        self.xmlrpc_disconnect(port)

        # Chrome OS on the target closes down most external ports
        # for security.  We could open the port, but doing that
        # would conflict with security tests that check that only
        # expected ports are open.  So, to get to the port on the
        # target we use an ssh tunnel.
        local_port = utils.get_unused_port()
        tunnel_options = '-n -N -q -L %d:localhost:%d' % (local_port, port)
        ssh_cmd = make_ssh_command(opts=tunnel_options)
        tunnel_cmd = '%s %s' % (ssh_cmd, self.hostname)
        logging.debug('Full tunnel command: %s', tunnel_cmd)
        tunnel_proc = subprocess.Popen(tunnel_cmd, shell=True, close_fds=True)
        logging.debug('Started XMLRPC tunnel, local = %d'
                      ' remote = %d, pid = %d',
                      local_port, port, tunnel_proc.pid)

        # Start the server on the host.  Redirection in the command
        # below is necessary, because 'ssh' won't terminate until
        # background child processes close stdin, stdout, and
        # stderr.
        remote_cmd = '( %s ) </dev/null >/dev/null 2>&1 & echo $!' % command
        try:
            remote_pid = self.run(remote_cmd).stdout.rstrip('\n')
        except Exception:
            # The tunnel is not yet recorded in _xmlrpc_proxy_map, so
            # nothing else would ever stop it; shut it down here
            # before propagating the failure.
            if tunnel_proc.poll() is None:
                tunnel_proc.terminate()
                tunnel_proc.wait()
            raise
        logging.debug('Started XMLRPC server on host %s, pid = %s',
                      self.hostname, remote_pid)

        self._xmlrpc_proxy_map[port] = (cleanup, tunnel_proc)
        rpc_url = 'http://localhost:%d' % local_port
        return xmlrpclib.ServerProxy(rpc_url, allow_none=True)


    def xmlrpc_disconnect(self, port):
        """Disconnect from an XMLRPC server on the host.

        Terminates the remote XMLRPC server previously started for
        the given `port`.  Also closes the local ssh tunnel created
        for the connection to the host.  This function does not
        directly alter the state of a previously returned XMLRPC
        client object; however disconnection will cause all
        subsequent calls to methods on the object to fail.

        The remote server is only terminated when a `cleanup` pattern
        was supplied to `xmlrpc_connect()`.

        This function does nothing if requested to disconnect a port
        that was not previously connected via `self.xmlrpc_connect()`

        @param port Port number passed to a previous call to
                    `xmlrpc_connect()`
        """
        if port not in self._xmlrpc_proxy_map:
            return
        remote_name, tunnel_proc = self._xmlrpc_proxy_map[port]
        if remote_name:
            # We use 'pkill' to find our target process rather than
            # a PID, because the host may have rebooted since
            # connecting, and we don't want to kill an innocent
            # process with the same PID.
            #
            # 'pkill' helpfully exits with status 1 if no target
            # process is found, for which run() will throw an
            # exception.  We don't want that, so we ignore the
            # status.
            self.run("pkill -f '%s'" % remote_name, ignore_status=True)

        if tunnel_proc.poll() is None:
            tunnel_proc.terminate()
            # Reap the child so it does not linger as a zombie.
            tunnel_proc.wait()
            logging.debug('Terminated tunnel, pid %d', tunnel_proc.pid)
        else:
            logging.debug('Tunnel pid %d terminated early, status %d',
                          tunnel_proc.pid, tunnel_proc.returncode)
        del self._xmlrpc_proxy_map[port]


    def xmlrpc_disconnect_all(self):
        """Disconnect all known XMLRPC proxy ports."""
        # Iterate over a snapshot: xmlrpc_disconnect() deletes entries
        # from the map as it goes.
        for port in list(self._xmlrpc_proxy_map):
            self.xmlrpc_disconnect(port)


    def _ping_is_up(self):
        """Ping the host once, and return whether it responded."""
        return utils.ping(self.hostname, tries=1, deadline=1) == 0


    def _ping_wait_down(self, timeout):
        """Wait until the host no longer responds to `ping`.

        @param timeout Minimum time to allow before declaring the
                       host to be non-responsive.

        @return True if the host stopped responding to `ping`; False
                if it was still responding at the final check made
                after the deadline.
        """

        # This function is a slightly faster version of wait_down().
        #
        # In AbstractSSHHost.wait_down(), `ssh` is used to determine
        # whether the host is down.  In some situations (mine, at
        # least), `ssh` can take over a minute to determine that the
        # host is down.  The `ping` command answers the question
        # faster, so we use that here instead.
        #
        # There is no equivalent for wait_up(), because a target that
        # answers to `ping` won't necessarily respond to `ssh`.
        end_time = time.time() + timeout
        while time.time() <= end_time:
            if not self._ping_is_up():
                return True

        # If the timeout is short relative to the run time of
        # _ping_is_up(), we might be prone to false failures for
        # lack of checking frequently enough.  To be safe, we make
        # one last check _after_ the deadline.
        return not self._ping_is_up()


    def test_wait_for_sleep(self):
        """Wait for the client to enter low-power sleep mode.

        The test for "is asleep" can't distinguish a system that is
        powered off; to confirm that the unit was asleep, it is
        necessary to force resume, and then call
        `test_wait_for_resume()`.

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger sleep on the host
            host.test_wait_for_sleep()
            # trigger resume on the host
            host.test_wait_for_resume(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not go to sleep within
                            the allowed time.
        """
        if not self._ping_wait_down(timeout=self._SLEEP_TIMEOUT):
            raise error.TestFail(
                'client failed to sleep after %d seconds' %
                    self._SLEEP_TIMEOUT)


    def test_wait_for_resume(self, old_boot_id):
        """Wait for the client to resume from low-power sleep mode.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to entering sleep mode.  A
        `TestFail` exception is raised if the boot id changes.

        See @ref test_wait_for_sleep for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               target host went to sleep.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated a reboot rather than a sleep
                            cycle.
        """
        if not self.wait_up(timeout=self._RESUME_TIMEOUT):
            raise error.TestFail(
                'client failed to resume from sleep after %d seconds' %
                    self._RESUME_TIMEOUT)
        else:
            new_boot_id = self.get_boot_id()
            if new_boot_id != old_boot_id:
                raise error.TestFail(
                    'client rebooted, but sleep was expected'
                    ' (old boot %s, new boot %s)'
                        % (old_boot_id, new_boot_id))


    def test_wait_for_shutdown(self):
        """Wait for the client to shut down.

        The test for "has shut down" can't distinguish a system that
        is merely asleep; to confirm that the unit was down, it is
        necessary to force boot, and then call test_wait_for_boot().

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger shutdown on the host
            host.test_wait_for_shutdown()
            # trigger boot on the host
            host.test_wait_for_boot(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not shut down within the
                            allowed time.
        """
        if not self._ping_wait_down(timeout=self._SHUTDOWN_TIMEOUT):
            raise error.TestFail(
                'client failed to shut down after %d seconds' %
                    self._SHUTDOWN_TIMEOUT)


    def test_wait_for_boot(self, old_boot_id=None):
        """Wait for the client to boot from cold power.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to shutting down.  A
        `TestFail` exception is raised if the boot id does not
        change.  The boot id test is omitted if `old_boot_id` is not
        specified.

        See @ref test_wait_for_shutdown for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               shut down.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated that there was no reboot.
        """
        if not self.wait_up(timeout=self._REBOOT_TIMEOUT):
            raise error.TestFail(
                'client failed to reboot after %d seconds' %
                    self._REBOOT_TIMEOUT)
        elif old_boot_id:
            if self.get_boot_id() == old_boot_id:
                raise error.TestFail(
                    'client is back up, but did not reboot'
                    ' (boot %s)' % old_boot_id)