blob: 98658da91311a2d44b66e6dd5f8aa656fd2bee3f [file] [log] [blame]
Fang Deng5d518f42013-08-02 14:04:32 -07001# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4#
5# Expects to be run in an environment with sudo and no interactive password
6# prompt, such as within the Chromium OS development chroot.
7
8
9"""This file provides core logic for servo verify/repair process."""
10
11
12import httplib
13import logging
14import socket
Kevin Cheng79589982016-10-25 13:26:04 -070015import traceback
Fang Deng5d518f42013-08-02 14:04:32 -070016import xmlrpclib
17
18from autotest_lib.client.bin import utils
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -070019from autotest_lib.client.common_lib import control_data
Fang Deng5d518f42013-08-02 14:04:32 -070020from autotest_lib.client.common_lib import error
beeps5e8c45a2013-12-17 22:05:11 -080021from autotest_lib.client.common_lib import global_config
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -070022from autotest_lib.client.common_lib import host_states
Richard Barnette9a26ad62016-06-10 12:03:08 -070023from autotest_lib.client.common_lib import hosts
Dan Shi0942b1d2015-03-31 11:07:00 -070024from autotest_lib.client.common_lib import lsbrelease_utils
beeps5e8c45a2013-12-17 22:05:11 -080025from autotest_lib.client.common_lib.cros import autoupdater
26from autotest_lib.client.common_lib.cros import dev_server
Fang Deng5d518f42013-08-02 14:04:32 -070027from autotest_lib.client.common_lib.cros import retry
Kevin Cheng79589982016-10-25 13:26:04 -070028from autotest_lib.client.common_lib.cros.graphite import autotest_es
Christopher Wileycef1f902014-06-19 11:11:23 -070029from autotest_lib.client.common_lib.cros.network import ping_runner
Hsinyu Chaoe0b08e62015-08-11 10:50:37 +000030from autotest_lib.client.cros import constants as client_constants
Richard Barnettee519dcd2016-08-15 17:37:17 -070031from autotest_lib.server import afe_utils
beeps5e8c45a2013-12-17 22:05:11 -080032from autotest_lib.server import site_utils as server_site_utils
Cheng-Yi Chiang22612862015-08-20 20:39:57 +080033from autotest_lib.server.cros import dnsname_mangler
Simran Basi0739d682015-02-25 16:22:56 -080034from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -070035from autotest_lib.server.cros.dynamic_suite import control_file_getter
Richard Barnette9a26ad62016-06-10 12:03:08 -070036from autotest_lib.server.cros.servo import servo
37from autotest_lib.server.hosts import servo_repair
Fang Deng5d518f42013-08-02 14:04:32 -070038from autotest_lib.server.hosts import ssh_host
Fang Dengd4fe7392013-09-20 12:18:21 -070039from autotest_lib.site_utils.rpm_control_system import rpm_client
Fang Deng5d518f42013-08-02 14:04:32 -070040
Dan Shi5e2efb72017-02-07 11:40:23 -080041try:
42 from chromite.lib import metrics
43except ImportError:
44 metrics = utils.metrics_mock
45
Fang Deng5d518f42013-08-02 14:04:32 -070046
# Names of the host attributes in the database that represent the values for
# the servo_host and servo_port for a servo connected to the DUT.
SERVO_HOST_ATTR = 'servo_host'
SERVO_PORT_ATTR = 'servo_port'
SERVO_BOARD_ATTR = 'servo_board'
SERVO_SERIAL_ATTR = 'servo_serial'

# Shorthand for the process-wide autotest configuration object.
_CONFIG = global_config.global_config
# When true, ServoHost.get_servod_server_proxy() reaches servod on remote
# hosts through rpc_server_tracker instead of a direct HTTP XML-RPC proxy.
ENABLE_SSH_TUNNEL_FOR_SERVO = _CONFIG.get_config_value(
        'CROS', 'enable_ssh_tunnel_for_servo', type=bool, default=False)

# Directory handed to FileSystemGetter when looking up the control file of
# the servo host reboot test.
AUTOTEST_BASE = _CONFIG.get_config_value(
        'SCHEDULER', 'drone_installation_directory',
        default='/usr/local/autotest')

# Names of the jobs used to reboot a servo host shared by several DUTs; the
# second one is used when a forced reboot is requested.
_SERVO_HOST_REBOOT_TEST_NAME = 'servohost_Reboot'
_SERVO_HOST_FORCE_REBOOT_TEST_NAME = 'servohost_Reboot.force_reboot'
Fang Deng5d518f42013-08-02 14:04:32 -070064
class ServoHost(ssh_host.SSHHost):
    """Host class for a host that controls a servo, e.g. beaglebone.

    The host may be a remote machine reached over ssh, or 'localhost', in
    which case commands are executed locally (see `run`).
    """

    # Default value of the `servo_port` argument of `_initialize`.
    DEFAULT_PORT = 9999

    # Timeout for initializing servo signals.
    INITIALIZE_SERVO_TIMEOUT_SECS = 30

    # Ready test function: passed as `ready_test_name` when the servod
    # proxy is created through rpc_server_tracker.
    SERVO_READY_METHOD = 'get_version'

    # Command handed to the parent class's reboot() as `reboot_cmd`: a
    # normal reboot started in the background, followed ten seconds later
    # by a forced one.
    REBOOT_CMD = 'sleep 1; reboot & sleep 10; reboot -f'
77
Fang Deng5d518f42013-08-02 14:04:32 -070078
Richard Barnette17bfc6c2016-08-04 18:41:43 -070079 def _initialize(self, servo_host='localhost',
Richard Barnettee519dcd2016-08-15 17:37:17 -070080 servo_port=DEFAULT_PORT, servo_board=None,
Kevin Cheng643ce8a2016-09-15 15:42:12 -070081 servo_serial=None, is_in_lab=None, *args, **dargs):
Fang Deng5d518f42013-08-02 14:04:32 -070082 """Initialize a ServoHost instance.
83
84 A ServoHost instance represents a host that controls a servo.
85
86 @param servo_host: Name of the host where the servod process
87 is running.
88 @param servo_port: Port the servod process is listening on.
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -070089 @param servo_board: Board that the servo is connected to.
Dan Shi4d478522014-02-14 13:46:32 -080090 @param is_in_lab: True if the servo host is in Cros Lab. Default is set
91 to None, for which utils.host_is_in_lab_zone will be
92 called to check if the servo host is in Cros lab.
Fang Deng5d518f42013-08-02 14:04:32 -070093
94 """
95 super(ServoHost, self)._initialize(hostname=servo_host,
96 *args, **dargs)
Richard Barnettee519dcd2016-08-15 17:37:17 -070097 self.servo_port = servo_port
98 self.servo_board = servo_board
Kevin Cheng643ce8a2016-09-15 15:42:12 -070099 self.servo_serial = servo_serial
Richard Barnettee519dcd2016-08-15 17:37:17 -0700100 self._servo = None
Richard Barnette9a26ad62016-06-10 12:03:08 -0700101 self._repair_strategy = (
102 servo_repair.create_servo_repair_strategy())
Richard Barnettee519dcd2016-08-15 17:37:17 -0700103 self._is_localhost = (self.hostname == 'localhost')
104 if self._is_localhost:
105 self._is_in_lab = False
106 elif is_in_lab is None:
Dan Shi4d478522014-02-14 13:46:32 -0800107 self._is_in_lab = utils.host_is_in_lab_zone(self.hostname)
108 else:
109 self._is_in_lab = is_in_lab
xixuan6cf6d2f2016-01-29 15:29:00 -0800110
Richard Barnettee519dcd2016-08-15 17:37:17 -0700111 # Commands on the servo host must be run by the superuser.
112 # Our account on a remote host is root, but if our target is
113 # localhost then we might be running unprivileged. If so,
114 # `sudo` will have to be added to the commands.
Fang Deng5d518f42013-08-02 14:04:32 -0700115 if self._is_localhost:
116 self._sudo_required = utils.system_output('id -u') != '0'
117 else:
118 self._sudo_required = False
Richard Barnettee519dcd2016-08-15 17:37:17 -0700119
Richard Barnette9a26ad62016-06-10 12:03:08 -0700120
121 def connect_servo(self):
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700122 """Establish a connection to the servod server on this host.
Richard Barnette9a26ad62016-06-10 12:03:08 -0700123
124 Initializes `self._servo` and then verifies that all network
125 connections are working. This will create an ssh tunnel if
126 it's required.
127
128 As a side effect of testing the connection, all signals on the
129 target servo are reset to default values, and the USB stick is
130 set to the neutral (off) position.
131 """
Kevin Cheng643ce8a2016-09-15 15:42:12 -0700132 servo_obj = servo.Servo(servo_host=self, servo_serial=self.servo_serial)
Richard Barnette9a26ad62016-06-10 12:03:08 -0700133 timeout, _ = retry.timeout(
134 servo_obj.initialize_dut,
135 timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS)
136 if timeout:
137 raise hosts.AutoservVerifyError(
138 'Servo initialize timed out.')
139 self._servo = servo_obj
140
141
142 def disconnect_servo(self):
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700143 """Disconnect our servo if it exists.
Richard Barnette9a26ad62016-06-10 12:03:08 -0700144
145 If we've previously successfully connected to our servo,
146 disconnect any established ssh tunnel, and set `self._servo`
147 back to `None`.
148 """
149 if self._servo:
150 # N.B. This call is safe even without a tunnel:
151 # rpc_server_tracker.disconnect() silently ignores
152 # unknown ports.
153 self.rpc_server_tracker.disconnect(self.servo_port)
154 self._servo = None
Fang Deng5d518f42013-08-02 14:04:32 -0700155
156
157 def is_in_lab(self):
158 """Check whether the servo host is a lab device.
159
160 @returns: True if the servo host is in Cros Lab, otherwise False.
161
162 """
163 return self._is_in_lab
164
165
166 def is_localhost(self):
167 """Checks whether the servo host points to localhost.
168
169 @returns: True if it points to localhost, otherwise False.
170
171 """
172 return self._is_localhost
173
174
175 def get_servod_server_proxy(self):
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700176 """Return a proxy that can be used to communicate with servod server.
Fang Deng5d518f42013-08-02 14:04:32 -0700177
178 @returns: An xmlrpclib.ServerProxy that is connected to the servod
179 server on the host.
Fang Deng5d518f42013-08-02 14:04:32 -0700180 """
Richard Barnette9a26ad62016-06-10 12:03:08 -0700181 if ENABLE_SSH_TUNNEL_FOR_SERVO and not self.is_localhost():
182 return self.rpc_server_tracker.xmlrpc_connect(
183 None, self.servo_port,
184 ready_test_name=self.SERVO_READY_METHOD,
185 timeout_seconds=60)
186 else:
187 remote = 'http://%s:%s' % (self.hostname, self.servo_port)
188 return xmlrpclib.ServerProxy(remote)
Fang Deng5d518f42013-08-02 14:04:32 -0700189
190
Richard Barnette9a26ad62016-06-10 12:03:08 -0700191 def is_cros_host(self):
beeps5e8c45a2013-12-17 22:05:11 -0800192 """Check if a servo host is running chromeos.
193
194 @return: True if the servo host is running chromeos.
195 False if it isn't, or we don't have enough information.
196 """
197 try:
198 result = self.run('grep -q CHROMEOS /etc/lsb-release',
199 ignore_status=True, timeout=10)
200 except (error.AutoservRunError, error.AutoservSSHTimeout):
201 return False
202 return result.exit_status == 0
203
204
Fang Deng5d518f42013-08-02 14:04:32 -0700205 def make_ssh_command(self, user='root', port=22, opts='', hosts_file=None,
206 connect_timeout=None, alive_interval=None):
207 """Override default make_ssh_command to use tuned options.
208
209 Tuning changes:
210 - ConnectTimeout=30; maximum of 30 seconds allowed for an SSH
211 connection failure. Consistency with remote_access.py.
212
213 - ServerAliveInterval=180; which causes SSH to ping connection every
214 180 seconds. In conjunction with ServerAliveCountMax ensures
215 that if the connection dies, Autotest will bail out quickly.
216
217 - ServerAliveCountMax=3; consistency with remote_access.py.
218
219 - ConnectAttempts=4; reduce flakiness in connection errors;
220 consistency with remote_access.py.
221
222 - UserKnownHostsFile=/dev/null; we don't care about the keys.
223
224 - SSH protocol forced to 2; needed for ServerAliveInterval.
225
226 @param user User name to use for the ssh connection.
227 @param port Port on the target host to use for ssh connection.
228 @param opts Additional options to the ssh command.
229 @param hosts_file Ignored.
230 @param connect_timeout Ignored.
231 @param alive_interval Ignored.
232
233 @returns: An ssh command with the requested settings.
234
235 """
236 base_command = ('/usr/bin/ssh -a -x %s -o StrictHostKeyChecking=no'
237 ' -o UserKnownHostsFile=/dev/null -o BatchMode=yes'
238 ' -o ConnectTimeout=30 -o ServerAliveInterval=180'
239 ' -o ServerAliveCountMax=3 -o ConnectionAttempts=4'
240 ' -o Protocol=2 -l %s -p %d')
241 return base_command % (opts, user, port)
242
243
244 def _make_scp_cmd(self, sources, dest):
245 """Format scp command.
246
247 Given a list of source paths and a destination path, produces the
248 appropriate scp command for encoding it. Remote paths must be
249 pre-encoded. Overrides _make_scp_cmd in AbstractSSHHost
250 to allow additional ssh options.
251
252 @param sources: A list of source paths to copy from.
253 @param dest: Destination path to copy to.
254
255 @returns: An scp command that copies |sources| on local machine to
256 |dest| on the remote servo host.
257
258 """
259 command = ('scp -rq %s -o BatchMode=yes -o StrictHostKeyChecking=no '
260 '-o UserKnownHostsFile=/dev/null -P %d %s "%s"')
261 return command % (self.master_ssh_option,
262 self.port, ' '.join(sources), dest)
263
264
265 def run(self, command, timeout=3600, ignore_status=False,
266 stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
Luigi Semenzatobfbd1f32017-01-06 10:41:18 -0800267 connect_timeout=30, ssh_failure_retry_ok=False,
268 options='', stdin=None, verbose=True, args=()):
Fang Deng5d518f42013-08-02 14:04:32 -0700269 """Run a command on the servo host.
270
271 Extends method `run` in SSHHost. If the servo host is a remote device,
272 it will call `run` in SSHost without changing anything.
273 If the servo host is 'localhost', it will call utils.system_output.
274
275 @param command: The command line string.
276 @param timeout: Time limit in seconds before attempting to
277 kill the running process. The run() function
278 will take a few seconds longer than 'timeout'
279 to complete if it has to kill the process.
280 @param ignore_status: Do not raise an exception, no matter
281 what the exit code of the command is.
282 @param stdout_tee/stderr_tee: Where to tee the stdout/stderr.
283 @param connect_timeout: SSH connection timeout (in seconds)
284 Ignored if host is 'localhost'.
285 @param options: String with additional ssh command options
286 Ignored if host is 'localhost'.
Luigi Semenzatobfbd1f32017-01-06 10:41:18 -0800287 @param ssh_failure_retry_ok: when True and ssh connection failure is
288 suspected, OK to retry command (but not
289 compulsory, and likely not needed here)
Fang Deng5d518f42013-08-02 14:04:32 -0700290 @param stdin: Stdin to pass (a string) to the executed command.
291 @param verbose: Log the commands.
292 @param args: Sequence of strings to pass as arguments to command by
293 quoting them in " and escaping their contents if necessary.
294
295 @returns: A utils.CmdResult object.
296
297 @raises AutoservRunError if the command failed.
298 @raises AutoservSSHTimeout SSH connection has timed out. Only applies
299 when servo host is not 'localhost'.
300
301 """
302 run_args = {'command': command, 'timeout': timeout,
303 'ignore_status': ignore_status, 'stdout_tee': stdout_tee,
304 'stderr_tee': stderr_tee, 'stdin': stdin,
305 'verbose': verbose, 'args': args}
306 if self.is_localhost():
307 if self._sudo_required:
Michael Tangf9b3ada2016-11-18 16:01:05 -0800308 run_args['command'] = 'sudo -n sh -c "%s"' % utils.sh_escape(
309 command)
Fang Deng5d518f42013-08-02 14:04:32 -0700310 try:
311 return utils.run(**run_args)
312 except error.CmdError as e:
313 logging.error(e)
314 raise error.AutoservRunError('command execution error',
315 e.result_obj)
316 else:
317 run_args['connect_timeout'] = connect_timeout
318 run_args['options'] = options
319 return super(ServoHost, self).run(**run_args)
320
321
Richard Barnette9a26ad62016-06-10 12:03:08 -0700322 def _get_release_version(self):
Dan Shi0942b1d2015-03-31 11:07:00 -0700323 """Get the value of attribute CHROMEOS_RELEASE_VERSION from lsb-release.
324
325 @returns The version string in lsb-release, under attribute
326 CHROMEOS_RELEASE_VERSION.
327 """
328 lsb_release_content = self.run(
329 'cat "%s"' % client_constants.LSB_RELEASE).stdout.strip()
330 return lsbrelease_utils.get_chromeos_release_version(
331 lsb_release_content=lsb_release_content)
332
333
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700334 def get_attached_duts(self, afe):
335 """Gather a list of duts that use this servo host.
336
337 @param afe: afe instance.
338
339 @returns list of duts.
Richard Barnette3a7697f2016-04-20 11:33:27 -0700340 """
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700341 return afe.get_hosts_by_attribute(
342 attribute=SERVO_HOST_ATTR, value=self.hostname)
343
344
345 def get_board(self):
346 """Determine the board for this servo host.
347
348 @returns a string representing this servo host's board.
349 """
350 return lsbrelease_utils.get_current_board(
351 lsb_release_content=self.run('cat /etc/lsb-release').stdout)
352
353
354 def _choose_dut_for_synchronized_reboot(self, dut_list, afe):
355 """Choose which dut to schedule servo host reboot job.
356
357 We'll want a semi-deterministic way of selecting which host should be
358 scheduled for the servo host reboot job. For now we'll sort the
359 list with the expectation the dut list will stay consistent.
360 From there we'll grab the first dut that is available so we
361 don't schedule a job on a dut that will never run.
362
363 @param dut_list: List of the dut hostnames to choose from.
364 @param afe: Instance of the AFE.
365
366 @return hostname of dut to schedule job on.
367 """
368 afe_hosts = afe.get_hosts(dut_list)
369 afe_hosts.sort()
370 for afe_host in afe_hosts:
371 if afe_host.status not in host_states.UNAVAILABLE_STATES:
372 return afe_host.hostname
373 # If they're all unavailable, just return the first sorted dut.
374 dut_list.sort()
375 return dut_list[0]
376
377
378 def _sync_job_scheduled_for_duts(self, dut_list, afe):
379 """Checks if a synchronized reboot has been scheduled for these duts.
380
381 Grab all the host queue entries that aren't completed for the duts and
382 see if any of them have the expected job name.
383
384 @param dut_list: List of duts to check on.
385 @param afe: Instance of the AFE.
386
387 @returns True if the job is scheduled, False otherwise.
388 """
389 afe_hosts = afe.get_hosts(dut_list)
390 for afe_host in afe_hosts:
391 hqes = afe.get_host_queue_entries(host=afe_host.id, complete=0)
392 for hqe in hqes:
393 job = afe.get_jobs(id=hqe.job.id)
Kevin Cheng55265902016-10-19 12:46:50 -0700394 if job and job[0].name in (_SERVO_HOST_REBOOT_TEST_NAME,
395 _SERVO_HOST_FORCE_REBOOT_TEST_NAME):
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700396 return True
397 return False
398
399
Kevin Cheng55265902016-10-19 12:46:50 -0700400 def schedule_synchronized_reboot(self, dut_list, afe, force_reboot=False):
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700401 """Schedule a job to reboot the servo host.
402
403 When we schedule a job, it will create a ServoHost object which will
404 go through this entire flow of checking if a reboot is needed and
405 trying to schedule it. There is probably a better approach to setting
406 up a synchronized reboot but I'm coming up short on better ideas so I
407 apologize for this circus show.
408
Kevin Cheng55265902016-10-19 12:46:50 -0700409 @param dut_list: List of duts that need to be locked.
410 @param afe: Instance of afe.
411 @param force_reboot: Boolean to indicate if a forced reboot should be
412 scheduled or not.
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700413 """
414 # If we've already scheduled job on a dut, we're done here.
415 if self._sync_job_scheduled_for_duts(dut_list, afe):
416 return
417
418 # Looks like we haven't scheduled a job yet.
Kevin Cheng55265902016-10-19 12:46:50 -0700419 test = (_SERVO_HOST_REBOOT_TEST_NAME if not force_reboot
420 else _SERVO_HOST_FORCE_REBOOT_TEST_NAME)
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700421 dut = self._choose_dut_for_synchronized_reboot(dut_list, afe)
422 getter = control_file_getter.FileSystemGetter([AUTOTEST_BASE])
Kevin Cheng55265902016-10-19 12:46:50 -0700423 control_file = getter.get_control_file_contents_by_name(test)
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700424 control_type = control_data.CONTROL_TYPE_NAMES.SERVER
Kevin Cheng79589982016-10-25 13:26:04 -0700425 try:
426 afe.create_job(control_file=control_file, name=test,
427 control_type=control_type, hosts=[dut])
428 except Exception as e:
429 # Sometimes creating the job will raise an exception. We'll log it
430 # but we don't want to fail because of it.
Aviv Keshet5ae0a002017-05-05 10:23:33 -0700431 logging.exception('Scheduling reboot job failed due to Exception.')
Kevin Cheng79589982016-10-25 13:26:04 -0700432 metadata = {'dut': dut,
433 'servo_host': self.hostname,
434 'error': str(e),
435 'details': traceback.format_exc()}
436 # We want to track how often we fail here so we can justify
437 # investing some effort into hardening up afe.create_job().
438 autotest_es.post(use_http=True,
439 type_str='servohost_Reboot_schedule_fail',
440 metadata=metadata)
Kevin Cheng5f2ba6c2016-09-28 10:20:05 -0700441
442
443 def reboot(self, *args, **dargs):
444 """Reboot using special servo host reboot command."""
445 super(ServoHost, self).reboot(reboot_cmd=self.REBOOT_CMD,
446 *args, **dargs)
447
448
    def _check_for_reboot(self, updater):
        """Reboot this servo host if an upgrade is waiting.

        If the host has successfully downloaded and finalized a new
        build, reboot.  When more than one dut is attached to this servo
        host, the host is not rebooted here; a synchronized reboot job is
        scheduled instead and the method returns right away.

        @param updater: a ChromiumOSUpdater instance for checking
            whether reboot is needed.
        @return Return a (status, build) tuple reflecting the
            update_engine status and current build of the host
            at the end of the call.
        @raises error.AutoservHostError if the host fails to shut down or
            fails to come back after the reboot.
        """
        current_build_number = self._get_release_version()
        status = updater.check_update_status()
        if status == autoupdater.UPDATER_NEED_REBOOT:
            # Check if we need to schedule an organized reboot.
            afe = frontend_wrappers.RetryingAFE(
                    timeout_min=5, delay_sec=10,
                    server=server_site_utils.get_global_afe_hostname())
            dut_list = self.get_attached_duts(afe)
            logging.info('servo host has the following duts: %s', dut_list)
            if len(dut_list) > 1:
                logging.info('servo host has multiple duts, scheduling '
                             'synchronized reboot')
                self.schedule_synchronized_reboot(dut_list, afe)
                # Status is still "need reboot"; the scheduled job is
                # expected to take care of the actual reboot.
                return status, current_build_number

            logging.info('Rebooting servo host %s from build %s',
                         self.hostname, current_build_number)
            # Tell the reboot() call not to wait for completion.
            # Otherwise, the call will log reboot failure if servo does
            # not come back.  The logged reboot failure will lead to
            # test job failure.  If the test does not require servo, we
            # don't want servo failure to fail the test with error:
            # `Host did not return from reboot` in status.log.
            self.reboot(fastsync=True, wait=False)

            # We told the reboot() call not to wait, but we need to wait
            # for the reboot before we continue.  Alas.  The code from
            # here below is basically a copy of Host.wait_for_restart(),
            # with the logging bits ripped out, so that they can't cause
            # the failure logging problem described above.
            #
            # The black stain that this has left on my soul can never be
            # erased.
            #
            # NOTE(review): the boot id is sampled after the reboot has
            # already been requested.  Presumably the delay built into
            # REBOOT_CMD leaves enough time for this call -- confirm.
            old_boot_id = self.get_boot_id()
            if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
                                  warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
                                  old_boot_id=old_boot_id):
                raise error.AutoservHostError(
                        'servo host %s failed to shut down.' %
                        self.hostname)
            if self.wait_up(timeout=120):
                # Refresh both values so the caller sees the state after
                # the reboot.
                current_build_number = self._get_release_version()
                status = updater.check_update_status()
                logging.info('servo host %s back from reboot, with build %s',
                             self.hostname, current_build_number)
            else:
                raise error.AutoservHostError(
                        'servo host %s failed to come back from reboot.' %
                        self.hostname)
        return status, current_build_number
511
512
    def update_image(self, wait_for_update=False):
        """Update the image on the servo host, if needed.

        This method recognizes the following cases:
          * If the Host is not running Chrome OS, do nothing.
          * If `lsbrelease_utils.is_moblab()` reports moblab, do nothing.
          * If a previously triggered update is now complete, reboot
            to the new version.
          * If the host is processing a previously triggered update,
            do nothing.
          * If the host is running a version of Chrome OS different
            from the default for servo Hosts, trigger an update, but
            don't wait for it to complete.

        @param wait_for_update If an update needs to be applied and
            this is true, then don't return until the update is
            downloaded and finalized, and the host rebooted.
            NOTE(review): the code below only follows the update with
            `update_engine_client --follow`; no reboot is issued after
            that in this method -- confirm where the reboot happens.
        @raises dev_server.DevServerException: If all the devservers are down.
        @raises site_utils.ParseBuildNameException: If the devserver returns
            an invalid build name.
        @raises autoupdater.ChromiumOSError: If something goes wrong in the
            checking update engine client status or applying an update.
        @raises AutoservRunError: If the update_engine_client isn't present on
            the host, and the host is a cros_host.

        """
        # servod could be running in a Ubuntu workstation.
        if not self.is_cros_host():
            logging.info('Not attempting an update, either %s is not running '
                         'chromeos or we cannot find enough information about '
                         'the host.', self.hostname)
            return

        if lsbrelease_utils.is_moblab():
            logging.info('Not attempting an update, %s is running moblab.',
                         self.hostname)
            return

        target_build = afe_utils.get_stable_cros_image_name(self.get_board())
        # Element 3 of the parsed build name is compared against the
        # host's release version below.
        target_build_number = server_site_utils.ParseBuildName(
                target_build)[3]
        # For servo image staging, we want it as more widely distributed as
        # possible, so that devservers' load can be evenly distributed. So use
        # hostname instead of target_build as hash.
        ds = dev_server.ImageServer.resolve(self.hostname,
                                            hostname=self.hostname)
        url = ds.get_update_url(target_build)

        updater = autoupdater.ChromiumOSUpdater(update_url=url, host=self)
        # This may reboot the host into an already finalized update.
        status, current_build_number = self._check_for_reboot(updater)
        # Cleared only in the "no update required" branch below.
        update_pending = True
        if status in autoupdater.UPDATER_PROCESSING_UPDATE:
            logging.info('servo host %s already processing an update, update '
                         'engine client status=%s', self.hostname, status)
        elif current_build_number != target_build_number:
            logging.info('Using devserver url: %s to trigger update on '
                         'servo host %s, from %s to %s', url, self.hostname,
                         current_build_number, target_build_number)
            try:
                ds.stage_artifacts(target_build,
                                   artifacts=['full_payload'])
            except Exception as e:
                # Best effort: a staging failure skips the update for now
                # instead of failing the caller.
                logging.error('Staging artifacts failed: %s', str(e))
                logging.error('Abandoning update for this cycle.')
            else:
                try:
                    # TODO(jrbarnette):  This 'touch' is a gross hack
                    # to get us past crbug.com/613603.  Once that
                    # bug is resolved, we should remove this code.
                    self.run('touch /home/chronos/.oobe_completed')
                    updater.trigger_update()
                except autoupdater.RootFSUpdateError as e:
                    trigger_download_status = 'failed with %s' % str(e)
                    metrics.Counter('chromeos/autotest/servo/'
                                    'rootfs_update_failed').increment()
                else:
                    trigger_download_status = 'passed'
                logging.info('Triggered download and update %s for %s, '
                             'update engine currently in status %s',
                             trigger_download_status, self.hostname,
                             updater.check_update_status())
        else:
            logging.info('servo host %s does not require an update.',
                         self.hostname)
            update_pending = False

        # NOTE(review): update_pending is still True when staging or
        # triggering failed above, so the wait below also runs in those
        # cases -- confirm this is intended.
        if update_pending and wait_for_update:
            logging.info('Waiting for servo update to complete.')
            self.run('update_engine_client --follow', ignore_status=True)
beeps5e8c45a2013-12-17 22:05:11 -0800601
602
Richard Barnette1edbb162016-11-01 11:47:50 -0700603 def verify(self, silent=False):
604 """Update the servo host and verify it's in a good state.
605
606 @param silent If true, suppress logging in `status.log`.
607 """
Richard Barnette79d78c42016-05-25 09:31:21 -0700608 # TODO(jrbarnette) Old versions of beaglebone_servo include
Richard Barnette9a26ad62016-06-10 12:03:08 -0700609 # the powerd package. If you touch the .oobe_completed file
610 # (as we do to work around an update_engine problem), then
611 # powerd will eventually shut down the beaglebone for lack
612 # of (apparent) activity. Current versions of
Richard Barnette79d78c42016-05-25 09:31:21 -0700613 # beaglebone_servo don't have powerd, but until we can purge
614 # the lab of the old images, we need to make sure powerd
615 # isn't running.
616 self.run('stop powerd', ignore_status=True)
Richard Barnette9a26ad62016-06-10 12:03:08 -0700617 try:
Richard Barnette1edbb162016-11-01 11:47:50 -0700618 self._repair_strategy.verify(self, silent)
Richard Barnette9a26ad62016-06-10 12:03:08 -0700619 except:
620 self.disconnect_servo()
621 raise
Fang Deng5d518f42013-08-02 14:04:32 -0700622
623
Richard Barnette1edbb162016-11-01 11:47:50 -0700624 def repair(self, silent=False):
625 """Attempt to repair servo host.
626
627 @param silent If true, suppress logging in `status.log`.
628 """
Richard Barnette9a26ad62016-06-10 12:03:08 -0700629 try:
Richard Barnette1edbb162016-11-01 11:47:50 -0700630 self._repair_strategy.repair(self, silent)
Richard Barnette9a26ad62016-06-10 12:03:08 -0700631 except:
632 self.disconnect_servo()
633 raise
Fang Deng5d518f42013-08-02 14:04:32 -0700634
635
Fang Dengd4fe7392013-09-20 12:18:21 -0700636 def has_power(self):
637 """Return whether or not the servo host is powered by PoE."""
638 # TODO(fdeng): See crbug.com/302791
639 # For now, assume all servo hosts in the lab have power.
640 return self.is_in_lab()
641
642
def power_cycle(self):
    """Cycle power to this host via PoE if it is a lab device.

    When `has_power()` is false, nothing is done beyond logging that
    the power cycle was skipped.

    @raises AutoservRepairError if it fails to power cycle the
            servo host.

    """
    if self.has_power():
        try:
            rpm_client.set_power(self.hostname, 'CYCLE')
        except (socket.error, xmlrpclib.Error,
                httplib.BadStatusLine,
                rpm_client.RemotePowerException) as e:
            # Fold the various RPC/transport failures into the single
            # exception type that is documented for callers.
            raise hosts.AutoservRepairError(
                    'Power cycling %s failed: %s' % (self.hostname, e))
    else:
        logging.info('Skipping power cycling, not a lab device.')
660
661
def get_servo(self):
    """Get the cached servo.Servo object.

    This is a plain accessor for `self._servo`; it does not create or
    connect anything.

    @return: a servo.Servo object.
    """
    return self._servo
668
669
def make_servo_hostname(dut_hostname):
    """Given a DUT's hostname, return the hostname of its servo.

    The servo name is formed by appending '-servo' to the first
    dot-separated label of the DUT name; any remaining domain labels
    are kept unchanged, e.g. 'host1.cros' becomes 'host1-servo.cros'.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.

    """
    # `separator` is '' when the name has no dot, so an unqualified
    # name simply gets the suffix appended.
    first_label, separator, domain = dut_hostname.partition('.')
    return first_label + '-servo' + separator + domain
681
682
def servo_host_is_up(servo_hostname):
    """Given a servo host name, return if it's up or not.

    The host is considered up when the ping result reports at least
    one received reply out of the three pings requested.

    @param servo_hostname: hostname of the servo host.

    @return True if it's up, False otherwise
    """
    # Technically, this duplicates the SSH ping done early in the servo
    # proxy initialization code.  However, this ping ends in a couple
    # seconds when it fails, rather than the 60 seconds it takes to decide
    # that an SSH ping has timed out.  Specifically, that timeout happens
    # when our servo DNS name resolves, but there is no host at that IP.
    logging.info('Pinging servo host at %s', servo_hostname)
    ping_config = ping_runner.PingConfig(
            servo_hostname, count=3,
            ignore_result=True, ignore_status=True)
    return ping_runner.PingRunner().ping(ping_config).received > 0
700
701
Richard Barnettee519dcd2016-08-15 17:37:17 -0700702def _map_afe_board_to_servo_board(afe_board):
703 """Map a board we get from the AFE to a servo appropriate value.
704
705 Many boards are identical to other boards for servo's purposes.
706 This function makes that mapping.
707
708 @param afe_board string board name received from AFE.
709 @return board we expect servo to have.
710
711 """
712 KNOWN_SUFFIXES = ['-freon', '_freon', '_moblab', '-cheets']
713 BOARD_MAP = {'gizmo': 'panther'}
714 mapped_board = afe_board
715 if afe_board in BOARD_MAP:
716 mapped_board = BOARD_MAP[afe_board]
717 else:
718 for suffix in KNOWN_SUFFIXES:
719 if afe_board.endswith(suffix):
720 mapped_board = afe_board[0:-len(suffix)]
721 break
722 if mapped_board != afe_board:
723 logging.info('Mapping AFE board=%s to %s', afe_board, mapped_board)
724 return mapped_board
725
726
def _get_standard_servo_args(dut_host):
    """Return servo data associated with a given DUT.

    This checks for the presence of servo host and port attached to the
    given `dut_host`.  This data should be stored in the
    `_afe_host.attributes` field in the provided `dut_host` parameter.

    If the attributes carry a servo port that cannot be converted to an
    integer, the error is logged and no `servo_args` are returned.

    When `servo_args` are found and the DUT's host info has a board, the
    board (mapped to its servo equivalent) is added to `servo_args`.

    @param dut_host   Instance of `Host` on which to find the servo
                      attributes.
    @return A tuple of `servo_args` dict with host and an option port,
            plus an `is_in_lab` flag indicating whether this in the CrOS
            test lab, or some different environment.  `servo_args` is
            `None` when no usable servo data was found.
    """
    servo_args = None
    is_in_lab = False
    is_ssp_moblab = False
    if utils.is_in_container():
        is_moblab = _CONFIG.get_config_value(
                'SSP', 'is_moblab', type=bool, default=False)
        is_ssp_moblab = is_moblab
    else:
        is_moblab = utils.is_moblab()
    attrs = dut_host._afe_host.attributes
    if attrs and SERVO_HOST_ATTR in attrs:
        servo_host = attrs[SERVO_HOST_ATTR]
        if (is_ssp_moblab and servo_host in ['localhost', '127.0.0.1']):
            servo_host = _CONFIG.get_config_value(
                    'SSP', 'host_container_ip', type=str, default=None)
        servo_args = {SERVO_HOST_ATTR: servo_host}
        if SERVO_PORT_ATTR in attrs:
            servo_port = attrs[SERVO_PORT_ATTR]
            try:
                servo_args[SERVO_PORT_ATTR] = int(servo_port)
            except (ValueError, TypeError):
                logging.error('servo port is not an int: %s', servo_port)
                # Let's set the servo args to None since we're not creating
                # the ServoHost object with the proper port now.
                servo_args = None
        # `servo_args` may have just been reset to None because of a bad
        # port; storing the serial into None would raise TypeError.
        if servo_args is not None and SERVO_SERIAL_ATTR in attrs:
            servo_args[SERVO_SERIAL_ATTR] = attrs[SERVO_SERIAL_ATTR]
        is_in_lab = (not is_moblab
                     and utils.host_is_in_lab_zone(servo_host))

    # TODO(jrbarnette):  This test to use the default lab servo hostname
    # is a legacy that we need only until every host in the DB has
    # proper attributes.
    elif (not is_moblab and
            not dnsname_mangler.is_ip_address(dut_host.hostname)):
        servo_host = make_servo_hostname(dut_host.hostname)
        is_in_lab = utils.host_is_in_lab_zone(servo_host)
        if is_in_lab:
            servo_args = {SERVO_HOST_ATTR: servo_host}
    if servo_args is not None:
        info = dut_host.host_info_store.get()
        if info.board:
            servo_args[SERVO_BOARD_ATTR] = _map_afe_board_to_servo_board(
                    info.board)
    return servo_args, is_in_lab
785
786
def create_servo_host(dut, servo_args, try_lab_servo=False,
                      try_servo_repair=False):
    """Create a ServoHost object for a given DUT, if appropriate.

    This function attempts to create and verify or repair a `ServoHost`
    object for a servo connected to the given `dut`, subject to various
    constraints imposed by the parameters:
      * When the `servo_args` parameter is not `None`, a servo
        host must be created, and must be checked with `repair()`.
      * Otherwise, if a servo exists in the lab and `try_lab_servo` is
        true:
          * If `try_servo_repair` is true, then create a servo host and
            check it with `repair()`.
          * Otherwise, if the servo responds to `ping` then create a
            servo host and check it with `verify()`.

    In cases where `servo_args` was not `None`, repair failure
    exceptions are passed back to the caller; otherwise, exceptions
    are logged and then discarded.  Note that `servo_args` is only
    supplied in cases where we're called from a test (not special task)
    control file that has an explicit dependency on servo.  In that
    case, we require that repair not write to `status.log`, so as to
    avoid polluting test results; `repair()` is therefore called with
    `silent=True`.

    TODO(jrbarnette):  The special handling for servo in test control
    files is a thorn in my flesh; I dearly hope to see it cut out before
    my retirement.

    Parameters for a servo host consist of a host name, port number, and
    DUT board, and are determined from one of these sources, in order of
    priority:
      * Servo attributes from the `dut` parameter take precedence over
        all other sources of information.
      * If a DNS entry for the servo based on the DUT hostname exists in
        the CrOS lab network, that hostname is used with the default
        port and the DUT's board.
      * If no other options are found, the parameters will be taken
        from the `servo_args` dict passed in from the caller.

    @param dut            An instance of `Host` from which to take
                          servo parameters (if available).
    @param servo_args     A dictionary with servo parameters to use if
                          they can't be found from `dut`.  If this
                          argument is supplied, unrepaired exceptions
                          from `repair()` will be passed back to the
                          caller.
    @param try_lab_servo  If not true, servo host creation will be
                          skipped unless otherwise required by the
                          caller.
    @param try_servo_repair  If true, check a servo host with
                          `repair()` instead of `verify()`.

    @returns: A ServoHost object or None. See comments above.

    """
    # Remember whether the caller supplied servo_args before they are
    # possibly replaced by data looked up from the DUT.
    servo_dependency = servo_args is not None
    is_in_lab = False
    if dut is not None and (try_lab_servo or servo_dependency):
        servo_args_override, is_in_lab = _get_standard_servo_args(dut)
        if servo_args_override is not None:
            servo_args = servo_args_override
    if servo_args is None:
        return None
    # Without a hard dependency or a repair request, a servo host that
    # doesn't answer ping is simply skipped.
    if (not servo_dependency and not try_servo_repair and
            not servo_host_is_up(servo_args[SERVO_HOST_ATTR])):
        return None
    newhost = ServoHost(is_in_lab=is_in_lab, **servo_args)
    # Note that the logic of repair() includes everything done
    # by verify().  It's sufficient to call one or the other;
    # we don't need both.
    if servo_dependency:
        newhost.repair(silent=True)
    else:
        try:
            if try_servo_repair:
                newhost.repair()
            else:
                newhost.verify()
        except Exception:
            # Best effort only: log the failure and still return the
            # host object to the caller.
            operation = 'repair' if try_servo_repair else 'verification'
            logging.exception('Servo %s failed for %s',
                              operation, newhost.hostname)
    return newhost