blob: 33b0c97347f1a08afa8d4a96b9054715d5162975 [file] [log] [blame]
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Expects to be run in an environment with sudo and no interactive password
# prompt, such as within the Chromium OS development chroot.
7
8
"""This file provides core logic for the servo verify/repair process."""
10
11
12import httplib
13import logging
14import socket
15import time
16import xmlrpclib
17
18from autotest_lib.client.bin import utils
19from autotest_lib.client.common_lib import error
20from autotest_lib.client.common_lib.cros import retry
21from autotest_lib.server.cros.servo import servo
22from autotest_lib.server.hosts import ssh_host
Fang Dengd4fe7392013-09-20 12:18:21 -070023from autotest_lib.site_utils.rpm_control_system import rpm_client
Fang Deng5d518f42013-08-02 14:04:32 -070024
25
class ServoHostException(error.AutoservError):
    """Common base class for every exception raised by ServoHost."""
29
30
class ServoHostVerifyFailure(ServoHostException):
    """Signals that verification of a servo host did not pass."""
34
35
class ServoHostRepairFailure(ServoHostException):
    """Signals that a single repair method could not fix a servo host."""
39
40
class ServoHostRepairTotalFailure(ServoHostException):
    """Signals that every attempt to repair a servo host has failed."""
44
45
def make_servo_hostname(dut_hostname):
    """Derive the servo hostname that corresponds to a DUT hostname.

    The suffix '-servo' is appended to the first dot-separated label of
    the name; everything after the first dot is kept unchanged.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.

    """
    # partition() yields an empty separator and tail when there is no dot,
    # so a bare host name simply gets the suffix appended.
    short_name, dot, domain = dut_hostname.partition('.')
    return short_name + '-servo' + dot + domain
57
58
class ServoHost(ssh_host.SSHHost):
    """Host class for a host that controls a servo, e.g. beaglebone."""

    # Timeout for getting the value of 'pwr_button'.
    PWR_BUTTON_CMD_TIMEOUT_SECS = 15
    # Timeout for rebooting servo host.
    REBOOT_TIMEOUT_SECS = 90
    # Passed as the down_timeout when rebooting the servo host.
    HOST_DOWN_TIMEOUT_SECS = 60
    # Delay after rebooting for servod to become fully functional.
    REBOOT_DELAY_SECS = 20
    # Servod process name.
    SERVOD_PROCESS = 'servod'

    # Number of failed power cycles after which the PoE repair gives up.
    _MAX_POWER_CYCLE_ATTEMPTS = 3


    def _initialize(self, servo_host='localhost', servo_port=9999,
                    *args, **dargs):
        """Initialize a ServoHost instance.

        A ServoHost instance represents a host that controls a servo.

        @param servo_host: Name of the host where the servod process
                           is running.
        @param servo_port: Port the servod process is listening on.
        @param args: Extra positional arguments forwarded to the base class
                     _initialize.
        @param dargs: Extra keyword arguments forwarded to the base class
                      _initialize.

        """
        super(ServoHost, self)._initialize(hostname=servo_host,
                                           *args, **dargs)
        self._is_in_lab = utils.host_is_in_lab_zone(self.hostname)
        self._is_localhost = (self.hostname == 'localhost')
        remote = 'http://%s:%s' % (self.hostname, servo_port)
        self._servod_server = xmlrpclib.ServerProxy(remote)
        # Commands on the servo host must be run by the superuser. Our account
        # on Beaglebone is root, but locally we might be running as a
        # different user. If so - `sudo ' will have to be added to the
        # commands.
        if self._is_localhost:
            self._sudo_required = utils.system_output('id -u') != '0'
        else:
            self._sudo_required = False


    def is_in_lab(self):
        """Check whether the servo host is a lab device.

        @returns: True if the servo host is in Cros Lab, otherwise False.

        """
        return self._is_in_lab


    def is_localhost(self):
        """Checks whether the servo host points to localhost.

        @returns: True if it points to localhost, otherwise False.

        """
        return self._is_localhost


    def get_servod_server_proxy(self):
        """Return a proxy that can be used to communicate with servod server.

        @returns: An xmlrpclib.ServerProxy that is connected to the servod
                  server on the host.

        """
        return self._servod_server


    def get_wait_up_processes(self):
        """Get the list of local processes to wait for in wait_up.

        Override get_wait_up_processes in
        autotest_lib.client.common_lib.hosts.base_classes.Host.
        Wait for servod process to go up. Called by base class when
        rebooting the device.

        @returns: A list containing the servod process name.

        """
        return [self.SERVOD_PROCESS]


    def make_ssh_command(self, user='root', port=22, opts='', hosts_file=None,
                         connect_timeout=None, alive_interval=None):
        """Override default make_ssh_command to use tuned options.

        Tuning changes:
          - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
          connection failure. Consistency with remote_access.py.

          - ServerAliveInterval=180; which causes SSH to ping connection every
          180 seconds. In conjunction with ServerAliveCountMax ensures
          that if the connection dies, Autotest will bail out quickly.

          - ServerAliveCountMax=3; consistency with remote_access.py.

          - ConnectionAttempts=4; reduce flakiness in connection errors;
          consistency with remote_access.py.

          - UserKnownHostsFile=/dev/null; we don't care about the keys.

          - SSH protocol forced to 2; needed for ServerAliveInterval.

        @param user User name to use for the ssh connection.
        @param port Port on the target host to use for ssh connection.
        @param opts Additional options to the ssh command.
        @param hosts_file Ignored.
        @param connect_timeout Ignored.
        @param alive_interval Ignored.

        @returns: An ssh command with the requested settings.

        """
        base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
                        ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
                        ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
                        ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
                        ' -o Protocol=2 -l %s -p %d')
        return base_command % (opts, user, port)


    def _make_scp_cmd(self, sources, dest):
        """Format scp command.

        Given a list of source paths and a destination path, produces the
        appropriate scp command for encoding it. Remote paths must be
        pre-encoded. Overrides _make_scp_cmd in AbstractSSHHost
        to allow additional ssh options.

        @param sources: A list of source paths to copy from.
        @param dest: Destination path to copy to.

        @returns: An scp command that copies |sources| on local machine to
                  |dest| on the remote servo host.

        """
        command = ('scp -rq %s -o BatchMode=yes -o StrictHostKeyChecking=no '
                   '-o UserKnownHostsFile=/dev/null -P %d %s "%s"')
        return command % (self.master_ssh_option,
                          self.port, ' '.join(sources), dest)


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            connect_timeout=30, options='', stdin=None, verbose=True, args=()):
        """Run a command on the servo host.

        Extends method `run` in SSHHost. If the servo host is a remote device,
        it will call `run` in SSHHost without changing anything.
        If the servo host is 'localhost', it will call utils.run, prefixing
        the command with `sudo -n` when the local user is not root.

        @param command: The command line string.
        @param timeout: Time limit in seconds before attempting to
                        kill the running process. The run() function
                        will take a few seconds longer than 'timeout'
                        to complete if it has to kill the process.
        @param ignore_status: Do not raise an exception, no matter
                              what the exit code of the command is.
        @param stdout_tee/stderr_tee: Where to tee the stdout/stderr.
        @param connect_timeout: SSH connection timeout (in seconds)
                                Ignored if host is 'localhost'.
        @param options: String with additional ssh command options
                        Ignored if host is 'localhost'.
        @param stdin: Stdin to pass (a string) to the executed command.
        @param verbose: Log the commands.
        @param args: Sequence of strings to pass as arguments to command by
                     quoting them in " and escaping their contents if necessary.

        @returns: A utils.CmdResult object.

        @raises AutoservRunError if the command failed.
        @raises AutoservSSHTimeout SSH connection has timed out. Only applies
                when servo host is not 'localhost'.

        """
        run_args = {'command': command, 'timeout': timeout,
                    'ignore_status': ignore_status, 'stdout_tee': stdout_tee,
                    'stderr_tee': stderr_tee, 'stdin': stdin,
                    'verbose': verbose, 'args': args}
        if self.is_localhost():
            if self._sudo_required:
                run_args['command'] = 'sudo -n %s' % command
            try:
                return utils.run(**run_args)
            except error.CmdError as e:
                # Translate the local failure into the exception type that
                # the remote (ssh) code path raises for a failed command.
                logging.error(e)
                raise error.AutoservRunError('command execution error',
                                             e.result_obj)
        else:
            # The ssh-specific settings only make sense for a remote host.
            run_args['connect_timeout'] = connect_timeout
            run_args['options'] = options
            return super(ServoHost, self).run(**run_args)


    def _check_servod(self):
        """A sanity check of the servod state.

        Asks servod for the value of 'pwr_button' and treats a timeout or
        a connection/protocol error as a verification failure.

        @raises ServoHostVerifyFailure if the request times out or fails.

        """
        msg_prefix = 'Servod error: %s'
        error_msg = None
        try:
            # retry.timeout returns a pair; the first item is used here as
            # the timed-out flag and the second item is discarded.
            timeout, _ = retry.timeout(
                    self._servod_server.get, args=('pwr_button', ),
                    timeout_sec=self.PWR_BUTTON_CMD_TIMEOUT_SECS)
            if timeout:
                error_msg = msg_prefix % 'Request timed out.'
        except (socket.error, xmlrpclib.Error, httplib.BadStatusLine) as e:
            error_msg = msg_prefix % e
        if error_msg:
            raise ServoHostVerifyFailure(error_msg)


    def _check_servo_host_usb(self):
        """A sanity check of the USB device.

        Sometimes the usb gets wedged due to a kernel bug on the beaglebone.
        A symptom is the presence of /dev/sda without /dev/sda1. The check
        here ensures that if /dev/sda exists, /dev/sda1 must also exist.
        See crbug.com/225932.

        @raises ServoHostVerifyFailure if /dev/sda exists without /dev/sda1 on
                the beaglebone.

        """
        try:
            # The following test exits with a non-zero code
            # and raises AutoservRunError if error is detected.
            self.run('test ! -b /dev/sda -o -b /dev/sda1')
        except (error.AutoservRunError, error.AutoservSSHTimeout) as e:
            raise ServoHostVerifyFailure(
                    'USB sanity check on %s failed: %s' % (self.hostname, e))


    def verify_software(self):
        """Verify that the servo is in a good state.

        It overrides the base class function for verify_software.
        It checks:
            1) Whether basic servo command can run successfully.
            2) Whether USB is in a good state. crbug.com/225932

        @raises ServoHostVerifyFailure if servo host does not pass the checks.

        """
        logging.info('Verifying servo host %s with sanity checks.',
                     self.hostname)
        self._check_servod()
        self._check_servo_host_usb()
        logging.info('Sanity checks pass on servo host %s', self.hostname)


    def _repair_with_sysrq_reboot(self):
        """Reboot with magic SysRq key, then wait for servod to settle."""
        self.reboot(timeout=self.REBOOT_TIMEOUT_SECS,
                    down_timeout=self.HOST_DOWN_TIMEOUT_SECS,
                    reboot_cmd='echo "b" > /proc/sysrq-trigger &',
                    fastsync=True)
        # Allow some time for servod to get started.
        time.sleep(self.REBOOT_DELAY_SECS)


    def has_power(self):
        """Return whether or not the servo host is powered by PoE.

        @returns: True if the servo host is a lab device, otherwise False.

        """
        # TODO(fdeng): See crbug.com/302791
        # For now, assume all servo hosts in the lab have power.
        return self.is_in_lab()


    def power_cycle(self):
        """Cycle power to this host via PoE if it is a lab device.

        Hosts for which has_power() is False are skipped with a log message.

        @raises ServoHostRepairFailure if it fails to power cycle the
                servo host.

        """
        if self.has_power():
            try:
                rpm_client.set_power(self.hostname, 'CYCLE')
            except (socket.error, xmlrpclib.Error,
                    httplib.BadStatusLine,
                    rpm_client.RemotePowerException) as e:
                raise ServoHostRepairFailure(
                        'Power cycling %s failed: %s' % (self.hostname, e))
        else:
            logging.info('Skipping power cycling, not a lab device.')


    def _powercycle_to_repair(self):
        """Power cycle the servo host using PoE.

        The host is power cycled again each time it fails to come back up,
        until _MAX_POWER_CYCLE_ATTEMPTS failures have been counted.

        @raises ServoHostRepairFailure if it fails to fix the servo host.

        """
        if not self.has_power():
            raise ServoHostRepairFailure('%s does not support power.' %
                                         self.hostname)
        logging.info('Attempting repair via PoE powercycle.')
        failed_cycles = 0
        self.power_cycle()
        while not self.wait_up(timeout=self.REBOOT_TIMEOUT_SECS):
            failed_cycles += 1
            if failed_cycles >= self._MAX_POWER_CYCLE_ATTEMPTS:
                raise ServoHostRepairFailure(
                        'Powercycled host %s %d times; device did not come back'
                        ' online.' % (self.hostname, failed_cycles))
            self.power_cycle()
        logging.info('Powercycling was successful after %d failures.',
                     failed_cycles)
        # Allow some time for servod to get started.
        time.sleep(self.REBOOT_DELAY_SECS)


    def repair_full(self):
        """Attempt to repair servo host.

        This overrides the base class function for repair.
        Note if the host is not in Cros Lab, the repair procedure
        will be skipped.

        Each repair method is tried in order and followed by verify(); the
        first one that passes verification ends the repair.

        @raises ServoHostRepairTotalFailure if all attempts fail.

        """
        if not self.is_in_lab():
            logging.warning('Skip repairing servo host %s: Not a lab device.',
                            self.hostname)
            return
        logging.info('Attempting to repair servo host %s.', self.hostname)
        repair_funcs = [self._repair_with_sysrq_reboot,
                        self._powercycle_to_repair]
        errors = []
        for repair_func in repair_funcs:
            try:
                repair_func()
                self.verify()
                return
            except Exception as e:
                # Deliberately broad: any failure of this repair method or
                # of the following verify() moves on to the next method.
                logging.warning('Failed to repair servo: %s', e)
                errors.append(str(e))
        raise ServoHostRepairTotalFailure(
                'All attempts at repairing the servo failed:\n%s' %
                '\n'.join(errors))


    def create_healthy_servo_object(self):
        """Create a servo.Servo object.

        Create a servo.Servo object. If the servo host is in Cros Lab,
        this method will first verify the servo host and attempt to repair it if
        error is detected.

        @returns: A servo.Servo object created with this servo host.

        @raises ServoHostRepairTotalFailure if it fails to fix the servo host.
        @raises AutoservSshPermissionDeniedError if the DUT is not ssh-able
                due to permission error.

        """
        if self.is_in_lab():
            try:
                self.verify()
            except (error.AutoservSSHTimeout,
                    error.AutoservSshPingHostError,
                    error.AutoservHostIsShuttingDownError,
                    ServoHostVerifyFailure):
                self.repair_full()
            except error.AutoservSshPermissionDeniedError:
                # Listed explicitly to document that a permission error is
                # not repaired here and is passed on to the caller.
                raise
        return servo.Servo(servo_host=self)