blob: d15df38b65596b83ab5db0ec83ef9c675fdcaf36 [file] [log] [blame]
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Expects to be run in an environment with sudo and no interactive password
# prompt, such as within the Chromium OS development chroot.
7
8
9"""This file provides core logic for servo verify/repair process."""
10
11
12import httplib
13import logging
14import socket
15import time
16import xmlrpclib
17
18from autotest_lib.client.bin import utils
19from autotest_lib.client.common_lib import error
20from autotest_lib.client.common_lib.cros import retry
21from autotest_lib.server.cros.servo import servo
22from autotest_lib.server.hosts import ssh_host
Fang Dengf0ea6142013-10-10 21:43:16 -070023from autotest_lib.site_utils.graphite import stats
Fang Dengd4fe7392013-09-20 12:18:21 -070024from autotest_lib.site_utils.rpm_control_system import rpm_client
Fang Deng5d518f42013-08-02 14:04:32 -070025
26
class ServoHostException(error.AutoservError):
    """Common parent of every exception raised by ServoHost."""
30
31
class ServoHostVerifyFailure(ServoHostException):
    """Signals that a servo host did not pass verification."""
35
36
class ServoHostRepairFailure(ServoHostException):
    """Signals that one repair method could not fix the servo host."""
40
41
class ServoHostRepairMethodNA(ServoHostException):
    """Signals that a repair method does not apply to the servo host."""
45
46
class ServoHostRepairTotalFailure(ServoHostException):
    """Signals that every attempted repair of a servo host failed."""
50
51
def make_servo_hostname(dut_hostname):
    """Derive the servo hostname that belongs to a DUT hostname.

    The suffix '-servo' is appended to the first dot-separated label of
    the name; everything after the first dot is kept unchanged.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.

    """
    # partition() yields empty separator/remainder when there is no dot,
    # so short (unqualified) names simply get the suffix appended.
    short_name, dot, domain = dut_hostname.partition('.')
    return short_name + '-servo' + dot + domain
63
64
65class ServoHost(ssh_host.SSHHost):
66 """Host class for a host that controls a servo, e.g. beaglebone."""
67
68 # Timeout for getting the value of 'pwr_button'.
69 PWR_BUTTON_CMD_TIMEOUT_SECS = 15
70 # Timeout for rebooting servo host.
71 REBOOT_TIMEOUT_SECS = 90
72 HOST_DOWN_TIMEOUT_SECS = 60
73 # Delay after rebooting for servod to become fully functional.
74 REBOOT_DELAY_SECS = 20
75 # Servod process name.
76 SERVOD_PROCESS = 'servod'
77
Fang Dengd4fe7392013-09-20 12:18:21 -070078 _MAX_POWER_CYCLE_ATTEMPTS = 3
79
Fang Deng5d518f42013-08-02 14:04:32 -070080
81 def _initialize(self, servo_host='localhost', servo_port=9999,
82 *args, **dargs):
83 """Initialize a ServoHost instance.
84
85 A ServoHost instance represents a host that controls a servo.
86
87 @param servo_host: Name of the host where the servod process
88 is running.
89 @param servo_port: Port the servod process is listening on.
90
91 """
92 super(ServoHost, self)._initialize(hostname=servo_host,
93 *args, **dargs)
94 self._is_in_lab = utils.host_is_in_lab_zone(self.hostname)
95 self._is_localhost = (self.hostname == 'localhost')
96 remote = 'http://%s:%s' % (self.hostname, servo_port)
97 self._servod_server = xmlrpclib.ServerProxy(remote)
98 # Commands on the servo host must be run by the superuser. Our account
99 # on Beaglebone is root, but locally we might be running as a
100 # different user. If so - `sudo ' will have to be added to the
101 # commands.
102 if self._is_localhost:
103 self._sudo_required = utils.system_output('id -u') != '0'
104 else:
105 self._sudo_required = False
106
107
108 def is_in_lab(self):
109 """Check whether the servo host is a lab device.
110
111 @returns: True if the servo host is in Cros Lab, otherwise False.
112
113 """
114 return self._is_in_lab
115
116
117 def is_localhost(self):
118 """Checks whether the servo host points to localhost.
119
120 @returns: True if it points to localhost, otherwise False.
121
122 """
123 return self._is_localhost
124
125
126 def get_servod_server_proxy(self):
127 """Return a proxy that can be used to communicate with servod server.
128
129 @returns: An xmlrpclib.ServerProxy that is connected to the servod
130 server on the host.
131
132 """
133 return self._servod_server
134
135
136 def get_wait_up_processes(self):
137 """Get the list of local processes to wait for in wait_up.
138
139 Override get_wait_up_processes in
140 autotest_lib.client.common_lib.hosts.base_classes.Host.
141 Wait for servod process to go up. Called by base class when
142 rebooting the device.
143
144 """
145 processes = [self.SERVOD_PROCESS]
146 return processes
147
148
149 def make_ssh_command(self, user='root', port=22, opts='', hosts_file=None,
150 connect_timeout=None, alive_interval=None):
151 """Override default make_ssh_command to use tuned options.
152
153 Tuning changes:
154 - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
155 connection failure. Consistency with remote_access.py.
156
157 - ServerAliveInterval=180; which causes SSH to ping connection every
158 180 seconds. In conjunction with ServerAliveCountMax ensures
159 that if the connection dies, Autotest will bail out quickly.
160
161 - ServerAliveCountMax=3; consistency with remote_access.py.
162
163 - ConnectAttempts=4; reduce flakiness in connection errors;
164 consistency with remote_access.py.
165
166 - UserKnownHostsFile=/dev/null; we don't care about the keys.
167
168 - SSH protocol forced to 2; needed for ServerAliveInterval.
169
170 @param user User name to use for the ssh connection.
171 @param port Port on the target host to use for ssh connection.
172 @param opts Additional options to the ssh command.
173 @param hosts_file Ignored.
174 @param connect_timeout Ignored.
175 @param alive_interval Ignored.
176
177 @returns: An ssh command with the requested settings.
178
179 """
180 base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
181 ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
182 ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
183 ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
184 ' -o Protocol=2 -l %s -p %d')
185 return base_command % (opts, user, port)
186
187
188 def _make_scp_cmd(self, sources, dest):
189 """Format scp command.
190
191 Given a list of source paths and a destination path, produces the
192 appropriate scp command for encoding it. Remote paths must be
193 pre-encoded. Overrides _make_scp_cmd in AbstractSSHHost
194 to allow additional ssh options.
195
196 @param sources: A list of source paths to copy from.
197 @param dest: Destination path to copy to.
198
199 @returns: An scp command that copies |sources| on local machine to
200 |dest| on the remote servo host.
201
202 """
203 command = ('scp -rq %s -o BatchMode=yes -o StrictHostKeyChecking=no '
204 '-o UserKnownHostsFile=/dev/null -P %d %s "%s"')
205 return command % (self.master_ssh_option,
206 self.port, ' '.join(sources), dest)
207
208
209 def run(self, command, timeout=3600, ignore_status=False,
210 stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
211 connect_timeout=30, options='', stdin=None, verbose=True, args=()):
212 """Run a command on the servo host.
213
214 Extends method `run` in SSHHost. If the servo host is a remote device,
215 it will call `run` in SSHost without changing anything.
216 If the servo host is 'localhost', it will call utils.system_output.
217
218 @param command: The command line string.
219 @param timeout: Time limit in seconds before attempting to
220 kill the running process. The run() function
221 will take a few seconds longer than 'timeout'
222 to complete if it has to kill the process.
223 @param ignore_status: Do not raise an exception, no matter
224 what the exit code of the command is.
225 @param stdout_tee/stderr_tee: Where to tee the stdout/stderr.
226 @param connect_timeout: SSH connection timeout (in seconds)
227 Ignored if host is 'localhost'.
228 @param options: String with additional ssh command options
229 Ignored if host is 'localhost'.
230 @param stdin: Stdin to pass (a string) to the executed command.
231 @param verbose: Log the commands.
232 @param args: Sequence of strings to pass as arguments to command by
233 quoting them in " and escaping their contents if necessary.
234
235 @returns: A utils.CmdResult object.
236
237 @raises AutoservRunError if the command failed.
238 @raises AutoservSSHTimeout SSH connection has timed out. Only applies
239 when servo host is not 'localhost'.
240
241 """
242 run_args = {'command': command, 'timeout': timeout,
243 'ignore_status': ignore_status, 'stdout_tee': stdout_tee,
244 'stderr_tee': stderr_tee, 'stdin': stdin,
245 'verbose': verbose, 'args': args}
246 if self.is_localhost():
247 if self._sudo_required:
248 run_args['command'] = 'sudo -n %s' % command
249 try:
250 return utils.run(**run_args)
251 except error.CmdError as e:
252 logging.error(e)
253 raise error.AutoservRunError('command execution error',
254 e.result_obj)
255 else:
256 run_args['connect_timeout'] = connect_timeout
257 run_args['options'] = options
258 return super(ServoHost, self).run(**run_args)
259
260
261 def _check_servod(self):
262 """A sanity check of the servod state."""
263 msg_prefix = 'Servod error: %s'
264 error_msg = None
265 try:
266 timeout, _ = retry.timeout(
267 self._servod_server.get, args=('pwr_button', ),
268 timeout_sec=self.PWR_BUTTON_CMD_TIMEOUT_SECS)
269 if timeout:
270 error_msg = msg_prefix % 'Request timed out.'
271 except (socket.error, xmlrpclib.Error, httplib.BadStatusLine) as e:
272 error_msg = msg_prefix % e
273 if error_msg:
274 raise ServoHostVerifyFailure(error_msg)
275
276
277 def _check_servo_host_usb(self):
278 """A sanity check of the USB device.
279
280 Sometimes the usb gets wedged due to a kernel bug on the beaglebone.
281 A symptom is the presence of /dev/sda without /dev/sda1. The check
282 here ensures that if /dev/sda exists, /dev/sda1 must also exist.
283 See crbug.com/225932.
284
285 @raises ServoHostVerifyFailure if /dev/sda exists without /dev/sda1 on
286 the beaglebone.
287
288 """
289 try:
290 # The following test exits with a non-zero code
291 # and raises AutoserverRunError if error is detected.
292 self.run('test ! -b /dev/sda -o -b /dev/sda1')
293 except (error.AutoservRunError, error.AutoservSSHTimeout) as e:
294 raise ServoHostVerifyFailure(
295 'USB sanity check on %s failed: %s' % (self.hostname, e))
296
297
298 def verify_software(self):
299 """Verify that the servo is in a good state.
300
301 It overrides the base class function for verify_software.
302 It checks:
303 1) Whether basic servo command can run successfully.
304 2) Whether USB is in a good state. crbug.com/225932
305
306 @raises ServoHostVerifyFailure if servo host does not pass the checks.
307
308 """
309 logging.info('Verifying servo host %s with sanity checks.',
310 self.hostname)
311 self._check_servod()
312 self._check_servo_host_usb()
313 logging.info('Sanity checks pass on servo host %s', self.hostname)
314
315
316 def _repair_with_sysrq_reboot(self):
317 """Reboot with magic SysRq key."""
318 self.reboot(timeout=self.REBOOT_TIMEOUT_SECS,
319 down_timeout=self.HOST_DOWN_TIMEOUT_SECS,
320 reboot_cmd='echo "b" > /proc/sysrq-trigger &',
321 fastsync=True)
322 time.sleep(self.REBOOT_DELAY_SECS)
323
324
Fang Dengd4fe7392013-09-20 12:18:21 -0700325 def has_power(self):
326 """Return whether or not the servo host is powered by PoE."""
327 # TODO(fdeng): See crbug.com/302791
328 # For now, assume all servo hosts in the lab have power.
329 return self.is_in_lab()
330
331
332 def power_cycle(self):
333 """Cycle power to this host via PoE if it is a lab device.
334
335 @raises ServoHostRepairFailure if it fails to power cycle the
336 servo host.
337
338 """
339 if self.has_power():
340 try:
341 rpm_client.set_power(self.hostname, 'CYCLE')
342 except (socket.error, xmlrpclib.Error,
343 httplib.BadStatusLine,
344 rpm_client.RemotePowerException) as e:
345 raise ServoHostRepairFailure(
346 'Power cycling %s failed: %s' % (self.hostname, e))
347 else:
348 logging.info('Skipping power cycling, not a lab device.')
349
350
Fang Deng5d518f42013-08-02 14:04:32 -0700351 def _powercycle_to_repair(self):
Fang Dengd4fe7392013-09-20 12:18:21 -0700352 """Power cycle the servo host using PoE.
353
354 @raises ServoHostRepairFailure if it fails to fix the servo host.
Fang Dengf0ea6142013-10-10 21:43:16 -0700355 @raises ServoHostRepairMethodNA if it does not support power.
Fang Dengd4fe7392013-09-20 12:18:21 -0700356
357 """
358 if not self.has_power():
Fang Dengf0ea6142013-10-10 21:43:16 -0700359 raise ServoHostRepairMethodNA('%s does not support power.' %
360 self.hostname)
Fang Dengd4fe7392013-09-20 12:18:21 -0700361 logging.info('Attempting repair via PoE powercycle.')
362 failed_cycles = 0
363 self.power_cycle()
364 while not self.wait_up(timeout=self.REBOOT_TIMEOUT_SECS):
365 failed_cycles += 1
366 if failed_cycles >= self._MAX_POWER_CYCLE_ATTEMPTS:
367 raise ServoHostRepairFailure(
368 'Powercycled host %s %d times; device did not come back'
369 ' online.' % (self.hostname, failed_cycles))
370 self.power_cycle()
371 logging.info('Powercycling was successful after %d failures.',
372 failed_cycles)
373 # Allow some time for servod to get started.
374 time.sleep(self.REBOOT_DELAY_SECS)
Fang Deng5d518f42013-08-02 14:04:32 -0700375
376
377 def repair_full(self):
378 """Attempt to repair servo host.
379
380 This overrides the base class function for repair.
381 Note if the host is not in Cros Lab, the repair procedure
382 will be skipped.
383
384 @raises ServoHostRepairTotalFailure if all attempts fail.
385
386 """
387 if not self.is_in_lab():
388 logging.warn('Skip repairing servo host %s: Not a lab device.',
389 self.hostname)
390 return
391 logging.info('Attempting to repair servo host %s.', self.hostname)
Fang Dengd4fe7392013-09-20 12:18:21 -0700392 repair_funcs = [self._repair_with_sysrq_reboot,
393 self._powercycle_to_repair]
Fang Deng5d518f42013-08-02 14:04:32 -0700394 errors = []
395 for repair_func in repair_funcs:
Fang Dengf0ea6142013-10-10 21:43:16 -0700396 counter_prefix = 'servo_host_repair.%s.' % repair_func.__name__
Fang Deng5d518f42013-08-02 14:04:32 -0700397 try:
398 repair_func()
399 self.verify()
Fang Dengf0ea6142013-10-10 21:43:16 -0700400 stats.Counter(counter_prefix + 'SUCCEEDED').increment()
Fang Deng5d518f42013-08-02 14:04:32 -0700401 return
Fang Dengf0ea6142013-10-10 21:43:16 -0700402 except ServoHostRepairMethodNA as e:
403 logging.warn('Repair method NA: %s', e)
404 stats.Counter(counter_prefix + 'RepairNA').increment()
405 errors.append(str(e))
Fang Deng5d518f42013-08-02 14:04:32 -0700406 except Exception as e:
407 logging.warn('Failed to repair servo: %s', e)
Fang Dengf0ea6142013-10-10 21:43:16 -0700408 stats.Counter(counter_prefix + 'FAILED').increment()
Fang Deng5d518f42013-08-02 14:04:32 -0700409 errors.append(str(e))
Fang Dengf0ea6142013-10-10 21:43:16 -0700410 stats.Counter('servo_host_repair.Full_Repair_Failed').increment()
Fang Deng5d518f42013-08-02 14:04:32 -0700411 raise ServoHostRepairTotalFailure(
412 'All attempts at repairing the servo failed:\n%s' %
413 '\n'.join(errors))
414
415
416 def create_healthy_servo_object(self):
417 """Create a servo.Servo object.
418
419 Create a servo.Servo object. If the servo host is in Cros Lab,
420 this method will first verify the servo host and attempt to repair it if
421 error is detected.
422
423 @raises ServoHostRepairTotalFailure if it fails to fix the servo host.
424 @raises AutoservSshPermissionDeniedError if the DUT is not ssh-able
425 due to permission error.
426
427 """
428 if self.is_in_lab():
429 try:
430 self.verify()
431 except (error.AutoservSSHTimeout,
432 error.AutoservSshPingHostError,
433 error.AutoservHostIsShuttingDownError,
434 ServoHostVerifyFailure):
435 self.repair_full()
436 except error.AutoservSshPermissionDeniedError:
437 raise
438 return servo.Servo(servo_host=self)