blob: 32587ae3efb459f4e1ab63f99872b22af6cf8984 [file] [log] [blame]
J. Richard Barnette24adbf42012-04-11 15:04:53 -07001# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Dale Curtisaa5eedb2011-08-23 16:18:52 -07002# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07005import time
6
J. Richard Barnette45e93de2012-04-11 17:24:15 -07007from autotest_lib.client.bin import utils
J. Richard Barnette134ec2c2012-04-25 12:59:37 -07008from autotest_lib.client.common_lib import global_config, error
J. Richard Barnette45e93de2012-04-11 17:24:15 -07009from autotest_lib.client.common_lib.cros import autoupdater
10from autotest_lib.server import autoserv_parser
11from autotest_lib.server import site_host_attributes
12from autotest_lib.server import site_remote_power
J. Richard Barnette67ccb872012-04-19 16:34:56 -070013from autotest_lib.server.cros import servo
J. Richard Barnette45e93de2012-04-11 17:24:15 -070014from autotest_lib.server.hosts import remote
J. Richard Barnette24adbf42012-04-11 15:04:53 -070015
16
def make_ssh_command(user='root', port=22, opts='', hosts_file=None,
                     connect_timeout=None, alive_interval=None):
    """Build the ssh command line prefix used to reach Chrome OS hosts.

    Overrides the default make_ssh_command with a fixed set of options
    tuned for Chrome OS.  The `hosts_file`, `connect_timeout` and
    `alive_interval` arguments are accepted for signature compatibility
    but are not consulted; the values below are always used instead.

    Fixed options:
      - StrictHostKeyChecking=no, UserKnownHostsFile=/dev/null; we
        don't care about the keys.  Host keys change with every new
        installation, so there is no point in saving them.

      - BatchMode=yes.

      - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
        connection failure.  Consistency with remote_access.sh.

      - ServerAliveInterval=180; causes SSH to ping the connection
        every 180 seconds.  In conjunction with ServerAliveCountMax
        this ensures that if the connection dies, Autotest will bail
        out quickly.  60 seconds was tried originally, but produced
        frequent job ABORTS where the test completed successfully.

      - ServerAliveCountMax=3; consistency with remote_access.sh.

      - ConnectionAttempts=4; reduce flakiness in connection errors;
        consistency with remote_access.sh.

      - Protocol=2; needed for ServerAliveInterval.

    @param user Login name, passed to ssh with `-l`.
    @param port Port number, passed to ssh with `-p`; formatted with
                `%d`.
    @param opts Extra text inserted verbatim right after `-a -x`.
    @param hosts_file Unused.
    @param connect_timeout Unused.
    @param alive_interval Unused.

    @return The ssh command line, as a single string.
    """
    ssh_options = (
        'StrictHostKeyChecking=no',
        'UserKnownHostsFile=/dev/null',
        'BatchMode=yes',
        'ConnectTimeout=30',
        'ServerAliveInterval=180',
        'ServerAliveCountMax=3',
        'ConnectionAttempts=4',
        'Protocol=2',
    )
    option_flags = ''.join(' -o ' + option for option in ssh_options)
    # The caller's `opts` keeps its own slot ahead of the fixed options;
    # an empty `opts` therefore leaves two adjacent spaces, as before.
    template = '/usr/bin/ssh -a -x %s' + option_flags + ' -l %s -p %d'
    return template % (opts, user, port)
J. Richard Barnette45e93de2012-04-11 17:24:15 -070046
47
class SiteHost(remote.RemoteHost):
    """Chromium OS specific subclass of Host."""

    _parser = autoserv_parser.autoserv_parser

    # Time to wait for new kernel to be marked successful.  Passed as
    # the `timeout` of utils.poll_for_condition() in machine_install().
    _KERNEL_UPDATE_TIMEOUT = 120

    # Ephemeral file to indicate that an update has just occurred.
    _JUST_UPDATED_FLAG = '/tmp/just_updated'

    # Timeout values (in seconds) used in test_wait_for_sleep(), et al.
    #
    # _RESUME_TIMEOUT has to be big enough to allow time for WiFi
    # reconnection.
    #
    # _REBOOT_TIMEOUT has to be big enough to allow time for the 30
    # second dev-mode screen delay _and_ time for network startup,
    # which takes several seconds longer than boot.
    #
    # TODO(jrbarnette):  None of these times have been thoroughly
    # tested empirically; if timeouts are a problem, increasing the
    # time limit really might be the right answer.
    _SLEEP_TIMEOUT = 2
    _RESUME_TIMEOUT = 5
    _SHUTDOWN_TIMEOUT = 5
    _REBOOT_TIMEOUT = 45


    def _initialize(self, hostname, require_servo=False, *args, **dargs):
        """Initialize superclasses, and |self.servo|.

        For creating the host servo object, there are three
        possibilities:  First, if the host is a lab system known to
        have a servo board, we connect to that servo unconditionally.
        Second, if we're called from a control file that requires
        servo features for testing, it will pass |require_servo| set
        to |True|, and we will start a local servod.  If neither of
        these cases apply, |self.servo| will be |None|.

        @param hostname Name of the host; forwarded to the superclass
                        and used to look up a lab servo.
        @param require_servo Whether to create a `servo.Servo()` when
                             no lab servo is found for `hostname`.
        """
        super(SiteHost, self)._initialize(hostname=hostname,
                                          *args, **dargs)
        self.servo = servo.Servo.get_lab_servo(hostname)
        if not self.servo and require_servo:
            self.servo = servo.Servo()


    def machine_install(self, update_url=None, force_update=False):
        """Update the host to a new image and check that the update took.

        If the updater reports that an update was run, the host is
        rebooted and the version and active kernel are verified.  In
        all cases, old Autotest directories listed in the global
        config are removed from the host afterwards.

        @param update_url URL of the update to install.  When not
                          given, the `image` option from the autoserv
                          command line parser is used.
        @param force_update Passed through to the updater's
                            `run_update()`.

        @exception ChromiumOSError No update URL was available, or one
                                   of the post-update checks failed.
        """
        if not update_url and self._parser.options.image:
            update_url = self._parser.options.image
        elif not update_url:
            raise autoupdater.ChromiumOSError(
                'Update failed. No update URL provided.')

        # Attempt to update the system.
        updater = autoupdater.ChromiumOSUpdater(update_url, host=self)
        if updater.run_update(force_update):
            # Figure out active and inactive kernel.
            active_kernel, inactive_kernel = updater.get_kernel_state()

            # Ensure inactive kernel has higher priority than active.
            if (updater.get_kernel_priority(inactive_kernel)
                    < updater.get_kernel_priority(active_kernel)):
                raise autoupdater.ChromiumOSError(
                    'Update failed. The priority of the inactive kernel'
                    ' partition is less than that of the active kernel'
                    ' partition.')

            # Updater has returned, successfully, reboot the host.
            self.reboot(timeout=60, wait=True)

            # Following the reboot, verify the correct version.
            updater.check_version()

            # Figure out newly active kernel.
            new_active_kernel, _ = updater.get_kernel_state()

            # Ensure that previously inactive kernel is now the active
            # kernel.
            if new_active_kernel != inactive_kernel:
                raise autoupdater.ChromiumOSError(
                    'Update failed. New kernel partition is not active after'
                    ' boot.')

            host_attributes = site_host_attributes.HostAttributes(
                self.hostname)
            if host_attributes.has_chromeos_firmware:
                # Wait until tries == 0 and success, or until timeout.
                utils.poll_for_condition(
                    lambda: (updater.get_kernel_tries(new_active_kernel) == 0
                             and updater.get_kernel_success(
                                 new_active_kernel)),
                    exception=autoupdater.ChromiumOSError(
                        'Update failed. Timed out waiting for system to mark'
                        ' new kernel as successful.'),
                    timeout=self._KERNEL_UPDATE_TIMEOUT, sleep_interval=5)

            # TODO(dalecurtis): Hack for R12 builds to make sure BVT runs of
            # platform_Shutdown pass correctly.
            if updater.update_version.startswith('0.12'):
                self.reboot(timeout=60, wait=True)

            # Mark host as recently updated. Hosts are rebooted at the end of
            # every test cycle which will remove the file.
            self.run('touch %s' % self._JUST_UPDATED_FLAG)

        # Clean up any old autotest directories which may be lying around.
        for path in global_config.global_config.get_config_value(
                'AUTOSERV', 'client_autodir_paths', type=list):
            self.run('rm -rf ' + path)


    def has_just_updated(self):
        """Indicates whether the host was updated within this boot."""
        # Check for the existence of the just updated flag file.
        return self.run(
            '[ -f %s ] && echo T || echo F'
            % self._JUST_UPDATED_FLAG).stdout.strip() == 'T'


    def cleanup(self):
        """Special cleanup method to make sure hosts always get power back.

        The power-on step runs in a `finally` clause so that it is
        still attempted when the superclass cleanup raises; the
        original exception then propagates to the caller as usual.
        """
        try:
            super(SiteHost, self).cleanup()
        finally:
            remote_power = site_remote_power.RemotePower(self.hostname)
            if remote_power:
                remote_power.set_power_on()


    def verify_software(self):
        """Ensure the stateful partition has space for Autotest and updates.

        Similar to what is done by AbstractSSH, except instead of checking the
        Autotest installation path, just check the stateful partition.

        Checking the stateful partition is preferable in case it has been
        wiped, resulting in an Autotest installation path which doesn't exist
        and isn't writable. We still want to pass verify in this state since
        the partition will be recovered with the next install.
        """
        super(SiteHost, self).verify_software()
        self.check_diskspace(
            '/mnt/stateful_partition',
            global_config.global_config.get_config_value(
                'SERVER', 'gb_diskspace_required', type=int,
                default=20))


    def _ping_is_up(self):
        """Ping the host once, and return whether it responded."""
        return utils.ping(self.hostname, tries=1, deadline=1) == 0


    def _ping_wait_down(self, timeout):
        """Wait until the host no longer responds to `ping`.

        @param timeout Minimum time to allow before declaring the
                       host to be non-responsive.

        @return `True` if a ping went unanswered, either before the
                deadline or on the one extra check made after it;
                `False` otherwise.
        """

        # This function is a slightly faster version of wait_down().
        #
        # In AbstractSSHHost.wait_down(), `ssh` is used to determine
        # whether the host is down.  In some situations (mine, at
        # least), `ssh` can take over a minute to determine that the
        # host is down.  The `ping` command answers the question
        # faster, so we use that here instead.
        #
        # There is no equivalent for wait_up(), because a target that
        # answers to `ping` won't necessarily respond to `ssh`.
        end_time = time.time() + timeout
        while time.time() <= end_time:
            if not self._ping_is_up():
                return True

        # If the timeout is short relative to the run time of
        # _ping_is_up(), we might be prone to false failures for
        # lack of checking frequently enough.  To be safe, we make
        # one last check _after_ the deadline.
        return not self._ping_is_up()


    def test_wait_for_sleep(self):
        """Wait for the client to enter low-power sleep mode.

        The test for "is asleep" can't distinguish a system that is
        powered off; to confirm that the unit was asleep, it is
        necessary to force resume, and then call
        `test_wait_for_resume()`.

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger sleep on the host
            host.test_wait_for_sleep()
            # trigger resume on the host
            host.test_wait_for_resume(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not go to sleep within
                            the allowed time.
        """
        if not self._ping_wait_down(timeout=self._SLEEP_TIMEOUT):
            raise error.TestFail(
                'client failed to sleep after %d seconds' %
                self._SLEEP_TIMEOUT)


    def test_wait_for_resume(self, old_boot_id):
        """Wait for the client to resume from low-power sleep mode.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to entering sleep mode.  A
        `TestFail` exception is raised if the boot id changes.

        See @ref test_wait_for_sleep for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               target host went to sleep.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated a reboot rather than a sleep
                            cycle.
        """
        if not self.wait_up(timeout=self._RESUME_TIMEOUT):
            raise error.TestFail(
                'client failed to resume from sleep after %d seconds' %
                self._RESUME_TIMEOUT)

        # The host answered; a changed boot id means it rebooted
        # instead of resuming.
        new_boot_id = self.get_boot_id()
        if new_boot_id != old_boot_id:
            raise error.TestFail(
                'client rebooted, but sleep was expected'
                ' (old boot %s, new boot %s)'
                % (old_boot_id, new_boot_id))


    def test_wait_for_shutdown(self):
        """Wait for the client to shut down.

        The test for "has shut down" can't distinguish a system that
        is merely asleep; to confirm that the unit was down, it is
        necessary to force boot, and then call test_wait_for_boot().

        This function is expected to be called from a test as part
        of a sequence like the following:

        ~~~~~~~~
            boot_id = host.get_boot_id()
            # trigger shutdown on the host
            host.test_wait_for_shutdown()
            # trigger boot on the host
            host.test_wait_for_boot(boot_id)
        ~~~~~~~~

        @exception TestFail The host did not shut down within the
                            allowed time.
        """
        if not self._ping_wait_down(timeout=self._SHUTDOWN_TIMEOUT):
            raise error.TestFail(
                'client failed to shut down after %d seconds' %
                self._SHUTDOWN_TIMEOUT)


    def test_wait_for_boot(self, old_boot_id=None):
        """Wait for the client to boot from cold power.

        The `old_boot_id` parameter should be the value from
        `get_boot_id()` obtained prior to shutting down.  A
        `TestFail` exception is raised if the boot id does not
        change.  The boot id test is omitted if `old_boot_id` is not
        specified.

        See @ref test_wait_for_shutdown for more on this function's
        usage.

        @param[in] old_boot_id A boot id value obtained before the
                               shut down.

        @exception TestFail The host did not respond within the
                            allowed time.
        @exception TestFail The host responded, but the boot id test
                            indicated that there was no reboot.
        """
        if not self.wait_up(timeout=self._REBOOT_TIMEOUT):
            raise error.TestFail(
                'client failed to reboot after %d seconds' %
                self._REBOOT_TIMEOUT)

        # Only compare boot ids when the caller supplied one.
        if old_boot_id and self.get_boot_id() == old_boot_id:
            raise error.TestFail(
                'client is back up, but did not reboot'
                ' (boot %s)' % old_boot_id)