blob: 1cfafeb1f318e3be90bae7be09c61062563090e3 [file] [log] [blame]
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Expects to be run in an environment with sudo and no interactive password
# prompt, such as within the Chromium OS development chroot.
7
8
9"""This file provides core logic for servo verify/repair process."""
10
11
12import httplib
13import logging
14import socket
15import time
16import xmlrpclib
17
18from autotest_lib.client.bin import utils
19from autotest_lib.client.common_lib import error
beeps5e8c45a2013-12-17 22:05:11 -080020from autotest_lib.client.common_lib import global_config
21from autotest_lib.client.common_lib.cros import autoupdater
22from autotest_lib.client.common_lib.cros import dev_server
Fang Deng5d518f42013-08-02 14:04:32 -070023from autotest_lib.client.common_lib.cros import retry
beeps5e8c45a2013-12-17 22:05:11 -080024from autotest_lib.server import site_utils as server_site_utils
Fang Deng5d518f42013-08-02 14:04:32 -070025from autotest_lib.server.cros.servo import servo
26from autotest_lib.server.hosts import ssh_host
Fang Dengf0ea6142013-10-10 21:43:16 -070027from autotest_lib.site_utils.graphite import stats
Fang Dengd4fe7392013-09-20 12:18:21 -070028from autotest_lib.site_utils.rpm_control_system import rpm_client
Fang Deng5d518f42013-08-02 14:04:32 -070029
30
class ServoHostException(error.AutoservError):
    """Common base class for every exception raised by ServoHost."""
34
35
class ServoHostVerifyFailure(ServoHostException):
    """Signals that verification of the servo host failed."""
39
40
class ServoHostRepairFailure(ServoHostException):
    """Signals that a single repair method could not fix the servo host."""
44
45
class ServoHostRepairMethodNA(ServoHostException):
    """Signals that a repair method does not apply to the servo host."""
49
50
class ServoHostRepairTotalFailure(ServoHostException):
    """Signals that every attempt to repair the servo host has failed."""
54
55
def make_servo_hostname(dut_hostname):
    """Derive the servo hostname that corresponds to a DUT hostname.

    The suffix '-servo' is appended to the first dot-separated label of
    the DUT hostname; any remaining labels are kept unchanged, e.g.
    'host1.cros' becomes 'host1-servo.cros'.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.

    """
    # partition() yields an empty separator and tail when there is no dot,
    # so a bare hostname simply gains the suffix.
    short_name, dot, domain = dut_hostname.partition('.')
    return ''.join((short_name, '-servo', dot, domain))
67
68
class ServoHost(ssh_host.SSHHost):
    """Host class for a machine that controls a servo, e.g. a beaglebone.

    The servo host is either a remote device reached over ssh or
    'localhost', in which case commands are executed locally (see `run`).
    """

    # Timeout for getting the value of 'pwr_button'.
    PWR_BUTTON_CMD_TIMEOUT_SECS = 15
    # Timeout for rebooting servo host.
    REBOOT_TIMEOUT_SECS = 90
    HOST_DOWN_TIMEOUT_SECS = 60
    # Delay after rebooting for servod to become fully functional.
    REBOOT_DELAY_SECS = 20
    # Servod process name.
    SERVOD_PROCESS = 'servod'

    # Upper bound on failed power cycles in _powercycle_to_repair.
    _MAX_POWER_CYCLE_ATTEMPTS = 3
    # Timer used to decorate _update_image.
    _timer = stats.Timer('servo_host')


    def _initialize(self, servo_host='localhost', servo_port=9999,
                    *args, **dargs):
        """Set up a ServoHost instance.

        Records whether the host is in the lab zone and whether it is
        'localhost', and builds the XML-RPC proxy used to talk to servod.

        @param servo_host: Name of the host where the servod process
                           is running.
        @param servo_port: Port the servod process is listening on.

        """
        super(ServoHost, self)._initialize(hostname=servo_host,
                                           *args, **dargs)
        self._is_in_lab = utils.host_is_in_lab_zone(self.hostname)
        self._is_localhost = (self.hostname == 'localhost')
        self._servod_server = xmlrpclib.ServerProxy(
                'http://%s:%s' % (self.hostname, servo_port))
        # Commands on the servo host must be run by the superuser. The
        # account used on a remote Beaglebone is root, but a local run may
        # be under a different user, in which case `sudo' has to be
        # prepended to the commands. The uid is only queried for localhost.
        self._sudo_required = (self._is_localhost and
                               utils.system_output('id -u') != '0')


    def is_in_lab(self):
        """Tell whether the servo host is a lab device.

        @returns: True if the servo host is in Cros Lab, otherwise False.

        """
        return self._is_in_lab


    def is_localhost(self):
        """Tell whether the servo host is 'localhost'.

        @returns: True if it points to localhost, otherwise False.

        """
        return self._is_localhost


    def get_servod_server_proxy(self):
        """Give access to the proxy for the servod server.

        @returns: The xmlrpclib.ServerProxy created for the servod
                  server on this host.

        """
        return self._servod_server


    def get_wait_up_processes(self):
        """List the local processes to wait for in wait_up.

        Overrides get_wait_up_processes in
        autotest_lib.client.common_lib.hosts.base_classes.Host so that
        waiting for the host also waits for the servod process. Called
        by the base class when rebooting the device.

        @returns: A list holding the servod process name.

        """
        return [self.SERVOD_PROCESS]


    def _is_cros_host(self):
        """Probe whether the servo host is running chromeos.

        @return: True if the servo host is running chromeos.
                 False if it isn't, or we don't have enough information.
        """
        try:
            grep_result = self.run('grep -q CHROMEOS /etc/lsb-release',
                                   ignore_status=True, timeout=10)
        except (error.AutoservRunError, error.AutoservSSHTimeout):
            # The probe itself failed, so nothing is known about the host.
            return False
        else:
            return grep_result.exit_status == 0


    def make_ssh_command(self, user='root', port=22, opts='', hosts_file=None,
                         connect_timeout=None, alive_interval=None):
        """Build the ssh command line with tuned options.

        Overrides the default make_ssh_command. Tuning changes:
          - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
          connection failure.  Consistency with remote_access.py.

          - ServerAliveInterval=180; which causes SSH to ping connection
          every 180 seconds. In conjunction with ServerAliveCountMax
          ensures that if the connection dies, Autotest will bail out
          quickly.

          - ServerAliveCountMax=3; consistency with remote_access.py.

          - ConnectionAttempts=4; reduce flakiness in connection errors;
          consistency with remote_access.py.

          - UserKnownHostsFile=/dev/null; we don't care about the keys.

          - SSH protocol forced to 2; needed for ServerAliveInterval.

        @param user User name to use for the ssh connection.
        @param port Port on the target host to use for ssh connection.
        @param opts Additional options to the ssh command.
        @param hosts_file Ignored.
        @param connect_timeout Ignored.
        @param alive_interval Ignored.

        @returns: An ssh command with the requested settings.

        """
        return ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
                ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
                ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
                ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
                ' -o Protocol=2 -l %s -p %d') % (opts, user, port)


    def _make_scp_cmd(self, sources, dest):
        """Build the scp command line.

        Given a list of source paths and a destination path, produces the
        appropriate scp command for encoding it. Remote paths must be
        pre-encoded. Overrides _make_scp_cmd in AbstractSSHHost
        to allow additional ssh options.

        @param sources: A list of source paths to copy from.
        @param dest: Destination path to copy to.

        @returns: An scp command that copies |sources| on local machine to
                  |dest| on the remote servo host.

        """
        joined_sources = ' '.join(sources)
        scp_template = ('scp -rq %s -o BatchMode=yes '
                        '-o StrictHostKeyChecking=no '
                        '-o UserKnownHostsFile=/dev/null -P %d %s "%s"')
        return scp_template % (self.master_ssh_option, self.port,
                               joined_sources, dest)


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            connect_timeout=30, options='', stdin=None, verbose=True, args=()):
        """Execute a command on the servo host.

        Extends method `run` in SSHHost. For a remote servo host the call
        is forwarded to `run` in SSHHost unchanged. For 'localhost' the
        command is executed through utils.run, prefixed with `sudo -n`
        when superuser rights are required.

        @param command: The command line string.
        @param timeout: Time limit in seconds before attempting to
                        kill the running process. The run() function
                        will take a few seconds longer than 'timeout'
                        to complete if it has to kill the process.
        @param ignore_status: Do not raise an exception, no matter
                              what the exit code of the command is.
        @param stdout_tee/stderr_tee: Where to tee the stdout/stderr.
        @param connect_timeout: SSH connection timeout (in seconds)
                                Ignored if host is 'localhost'.
        @param options: String with additional ssh command options
                        Ignored if host is 'localhost'.
        @param stdin: Stdin to pass (a string) to the executed command.
        @param verbose: Log the commands.
        @param args: Sequence of strings to pass as arguments to command by
                     quoting them in " and escaping their contents if
                     necessary.

        @returns: A utils.CmdResult object.

        @raises AutoservRunError if the command failed.
        @raises AutoservSSHTimeout SSH connection has timed out. Only applies
                when servo host is not 'localhost'.

        """
        run_args = dict(command=command, timeout=timeout,
                        ignore_status=ignore_status, stdout_tee=stdout_tee,
                        stderr_tee=stderr_tee, stdin=stdin,
                        verbose=verbose, args=args)

        if not self.is_localhost():
            # Only the ssh path understands the connection settings.
            run_args['connect_timeout'] = connect_timeout
            run_args['options'] = options
            return super(ServoHost, self).run(**run_args)

        if self._sudo_required:
            run_args['command'] = 'sudo -n %s' % command
        try:
            return utils.run(**run_args)
        except error.CmdError as e:
            # Report local failures with the same exception type that the
            # remote path raises.
            logging.error(e)
            raise error.AutoservRunError('command execution error',
                                         e.result_obj)


    def _check_servod(self):
        """Sanity check servod by querying 'pwr_button' over XML-RPC.

        @raises ServoHostVerifyFailure if the request times out or fails
                with a socket, XML-RPC or HTTP status line error.

        """
        msg_prefix = 'Servod error: %s'
        failure = None
        try:
            timed_out, _ = retry.timeout(
                    self._servod_server.get, args=('pwr_button', ),
                    timeout_sec=self.PWR_BUTTON_CMD_TIMEOUT_SECS)
        except (socket.error, xmlrpclib.Error, httplib.BadStatusLine) as e:
            failure = msg_prefix % e
        else:
            if timed_out:
                failure = msg_prefix % 'Request timed out.'
        if failure:
            raise ServoHostVerifyFailure(failure)


    def _check_servo_host_usb(self):
        """Sanity check the USB device on the servo host.

        Sometimes the usb gets wedged due to a kernel bug on the beaglebone.
        A symptom is the presence of /dev/sda without /dev/sda1. The check
        here ensures that if /dev/sda exists, /dev/sda1 must also exist.
        See crbug.com/225932.

        @raises ServoHostVerifyFailure if /dev/sda exists without /dev/sda1 on
                the beaglebone.

        """
        usb_check_cmd = 'test ! -b /dev/sda -o -b /dev/sda1'
        try:
            # `test` exits with a non-zero code when the problem is
            # present, which makes run() raise AutoservRunError.
            self.run(usb_check_cmd)
        except (error.AutoservRunError, error.AutoservSSHTimeout) as e:
            raise ServoHostVerifyFailure(
                    'USB sanity check on %s failed: %s' % (self.hostname, e))


    @_timer.decorate
    def _update_image(self):
        """Bring the servo host image up to date, if needed.

        This method does nothing for servo hosts that are not running chromeos.
        If the host is running chromeos, and a newer image is available on the
        devserver, trigger a download and apply it in the background. If an
        update has already been downloaded and applied, reboot the servo host
        into the new image. If update_engine_client is in the process of
        applying an update that was triggered on a previous invocation, do
        nothing.

        @raises dev_server.DevServerException: If all the devservers are down.
        @raises site_utils.ParseBuildNameException: If the devserver returns
            an invalid build name.
        @raises autoupdater.ChromiumOSError: If something goes wrong in the
            checking update engine client status or applying an update.
        @raises AutoservRunError: If the update_engine_client isn't present on
            the host, and the host is a cros_host.
        """
        #TODO(beeps): Remove this check once all servo hosts are using chromeos.
        if not self._is_cros_host():
            logging.info('Not attempting an update, either %s is not running '
                         'chromeos or we cannot find enough information about '
                         'the host.', self.hostname)
            return

        update_branch = global_config.global_config.get_config_value(
                'CROS', 'servo_builder')
        ds = dev_server.ImageServer.resolve(self.hostname)
        latest_build = ds.get_latest_build_in_server(target=update_branch)

        if latest_build is not None:
            latest_build = '%s/%s' % (update_branch, latest_build)
            latest_build_number = server_site_utils.ParseBuildName(
                    latest_build)[3]
            url = ds.get_update_url(latest_build)
        else:
            # We might have just purged all the beaglebone builds on the
            # devserver after having triggered a download the last time we
            # verified this beaglebone, so we still need to reboot if
            # necessary.
            logging.debug('Could not find any builds for %s on %s',
                          update_branch, ds.url())
            url = ds.url()
            latest_build_number = None

        updater = autoupdater.ChromiumOSUpdater(update_url=url, host=self)
        current_build_number = updater.get_build_id()
        status = updater.check_update_status()

        if status == autoupdater.UPDATER_NEED_REBOOT:
            logging.info('Rebooting beaglebone host %s with build %s',
                         self.hostname, current_build_number)
            self.reboot(reboot_cmd=('((reboot & sleep 10; reboot -f &) '
                                    '</dev/null >/dev/null 2>&1 &)'),
                        fastsync=True,
                        label=None,
                        wait=True)
            # Re-read the build id; `status` itself is not refreshed.
            current_build_number = updater.get_build_id()
            logging.info('servo host %s back from reboot, with build %s',
                         self.hostname, current_build_number)

        if status in autoupdater.UPDATER_PROCESSING_UPDATE:
            logging.info('servo host %s already processing an update, update '
                         'engine client status=%s', self.hostname, status)
        elif (latest_build_number and
              current_build_number != latest_build_number):
            logging.info('Using devserver url: %s to trigger update on '
                         'servo host %s, from %s to %s', url, self.hostname,
                         current_build_number, latest_build_number)
            try:
                updater.trigger_update()
            except autoupdater.RootFSUpdateError as e:
                # A failed trigger is recorded and logged, not propagated.
                trigger_download_status = 'failed with %s' % str(e)
                stats.Counter('servo_host.RootFSUpdateError').increment()
            else:
                trigger_download_status = 'passed'
            logging.info('Triggered download and update %s for %s, '
                         'update engine currently in status %s',
                         trigger_download_status, self.hostname,
                         updater.check_update_status())
        else:
            logging.info('servo host %s does not require an update.',
                         self.hostname)


    def verify_software(self):
        """Update the servo host, then check that it is in a good state.

        It overrides the base class function for verify_software.
        The image update step runs first (see _update_image). Then verifies:
            1) Whether basic servo command can run successfully.
            2) Whether USB is in a good state. crbug.com/225932

        @raises ServoHostVerifyFailure if servo host does not pass the checks.

        """
        logging.info('Applying an update to the servo host, if necessary.')
        self._update_image()

        logging.info('Verifying servo host %s with sanity checks.',
                     self.hostname)
        for sanity_check in (self._check_servod, self._check_servo_host_usb):
            sanity_check()
        logging.info('Sanity checks pass on servo host %s', self.hostname)


    def _repair_with_sysrq_reboot(self):
        """Reboot with the magic SysRq key, then wait REBOOT_DELAY_SECS."""
        self.reboot(reboot_cmd='echo "b" > /proc/sysrq-trigger &',
                    fastsync=True,
                    timeout=self.REBOOT_TIMEOUT_SECS,
                    down_timeout=self.HOST_DOWN_TIMEOUT_SECS)
        time.sleep(self.REBOOT_DELAY_SECS)


    def has_power(self):
        """Return whether or not the servo host is powered by PoE."""
        # TODO(fdeng): See crbug.com/302791
        # For now, assume all servo hosts in the lab have power.
        return self.is_in_lab()


    def power_cycle(self):
        """Cycle power to this host via PoE if it is a lab device.

        Logs a message and does nothing when has_power() is False.

        @raises ServoHostRepairFailure if it fails to power cycle the
                servo host.

        """
        if not self.has_power():
            logging.info('Skipping power cycling, not a lab device.')
            return
        try:
            rpm_client.set_power(self.hostname, 'CYCLE')
        except (socket.error, xmlrpclib.Error,
                httplib.BadStatusLine,
                rpm_client.RemotePowerException) as e:
            raise ServoHostRepairFailure(
                    'Power cycling %s failed: %s' % (self.hostname, e))


    def _powercycle_to_repair(self):
        """Repair the servo host by power cycling it using PoE.

        The host is power cycled and then waited for; this repeats until
        the host comes up or _MAX_POWER_CYCLE_ATTEMPTS cycles have failed.

        @raises ServoHostRepairFailure if it fails to fix the servo host.
        @raises ServoHostRepairMethodNA if it does not support power.

        """
        if not self.has_power():
            raise ServoHostRepairMethodNA('%s does not support power.' %
                                          self.hostname)
        logging.info('Attempting repair via PoE powercycle.')
        failed_cycles = 0
        while True:
            self.power_cycle()
            if self.wait_up(timeout=self.REBOOT_TIMEOUT_SECS):
                break
            failed_cycles += 1
            if failed_cycles >= self._MAX_POWER_CYCLE_ATTEMPTS:
                raise ServoHostRepairFailure(
                        'Powercycled host %s %d times; device did not come back'
                        ' online.' % (self.hostname, failed_cycles))
        logging.info('Powercycling was successful after %d failures.',
                     failed_cycles)
        # Allow some time for servod to get started.
        time.sleep(self.REBOOT_DELAY_SECS)


    def repair_full(self):
        """Try each repair method in turn until the servo host verifies.

        This overrides the base class function for repair.
        Note if the host is not in Cros Lab, the repair procedure
        will be skipped.

        @raises ServoHostRepairTotalFailure if all attempts fail.

        """
        if not self.is_in_lab():
            logging.warn('Skip repairing servo host %s: Not a lab device.',
                         self.hostname)
            return
        logging.info('Attempting to repair servo host %s.', self.hostname)
        failures = []
        for repair_func in (self._repair_with_sysrq_reboot,
                            self._powercycle_to_repair):
            counter_prefix = 'servo_host_repair.%s.' % repair_func.__name__
            try:
                repair_func()
                # A repair only counts once the host verifies again.
                self.verify()
                stats.Counter(counter_prefix + 'SUCCEEDED').increment()
                return
            except ServoHostRepairMethodNA as e:
                logging.warn('Repair method NA: %s', e)
                stats.Counter(counter_prefix + 'RepairNA').increment()
                failures.append(str(e))
            except Exception as e:
                # Any other failure just moves on to the next repair method.
                logging.warn('Failed to repair servo: %s', e)
                stats.Counter(counter_prefix + 'FAILED').increment()
                failures.append(str(e))
        stats.Counter('servo_host_repair.Full_Repair_Failed').increment()
        raise ServoHostRepairTotalFailure(
                'All attempts at repairing the servo failed:\n%s' %
                '\n'.join(failures))


    def create_healthy_servo_object(self):
        """Build a servo.Servo object backed by this host.

        If the servo host is in Cros Lab, this method will first verify
        the servo host and attempt to repair it if error is detected.
        Hosts outside the lab are used as they are.

        @returns: A servo.Servo object created with this servo host.

        @raises ServoHostRepairTotalFailure if it fails to fix the servo host.
        @raises AutoservSshPermissionDeniedError if the DUT is not ssh-able
                due to permission error.

        """
        if not self.is_in_lab():
            return servo.Servo(servo_host=self)
        try:
            self.verify()
        except (error.AutoservSSHTimeout,
                error.AutoservSshPingHostError,
                error.AutoservHostIsShuttingDownError,
                ServoHostVerifyFailure):
            self.repair_full()
        except error.AutoservSshPermissionDeniedError:
            # Permission errors are passed on to the caller unchanged.
            raise
        return servo.Servo(servo_host=self)