blob: 32a918d3634e52e0acb85d10c0f010772b9fa4d4 [file] [log] [blame]
# Lint as: python2, python3
# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5
"""
Repair actions and verifiers relating to CrOS firmware.

This contains the repair actions and verifiers needed to find problems
with the firmware installed on Chrome OS DUTs, and when necessary, to
fix problems by updating or re-installing the firmware.

The operations in the module support two distinct use cases:
  * DUTs used for FAFT tests can in some cases have problems with
    corrupted firmware.  The module supplies `FirmwareStatusVerifier`
    to check for corruption, and supplies `FaftFirmwareRepair` to
    re-install firmware of current faft stable_version via servo
    when needed.
  * DUTs used for general testing normally should be running a
    designated "stable" firmware version.  This module supplies
    `FirmwareVersionVerifier` to detect and automatically update
    firmware that is out-of-date from the designated version.  This
    module also supplies `GeneralFirmwareRepair` to re-install the
    firmware tied to the current stable_version image via servo when
    needed.

For purposes of the operations in the module, we distinguish three kinds
of DUT, based on pool assignments:
  * DUTs used for general testing.  These DUTs automatically check for
    and install the stable firmware using `FirmwareVersionVerifier`.
  * DUTs in pools used for FAFT testing.  These check for bad firmware
    builds with `FirmwareStatusVerifier`, and will fix problems using
    `FaftFirmwareRepair`.  These DUTs don't check for or install the
    stable firmware.
  * DUTs not in general pools, and not used for FAFT.  These DUTs
    are expected to be managed by separate processes and are excluded
    from all of the verification and repair code in this module.
"""
38
Xixuan Wu93e646c2017-12-07 18:36:10 -080039# pylint: disable=missing-docstring
40
Derek Beckettf73baca2020-08-19 15:08:47 -070041from __future__ import absolute_import
42from __future__ import division
43from __future__ import print_function
44
Hung-Te Lina014dbc2019-11-07 16:41:42 +080045import json
Richard Barnette90ad4262016-11-17 17:29:24 -080046import logging
Richard Barnette90ad4262016-11-17 17:29:24 -080047
48import common
Richard Barnette1bf22a32016-11-18 16:14:31 -080049from autotest_lib.client.common_lib import global_config
Richard Barnette90ad4262016-11-17 17:29:24 -080050from autotest_lib.client.common_lib import hosts
51from autotest_lib.server import afe_utils
Richard Barnette3245ae22018-08-31 11:50:08 -070052from autotest_lib.server.hosts import repair_utils
Gregory Nisbetd3007d22020-09-02 12:04:07 -070053from autotest_lib.server.hosts import cros_constants
54
55from chromite.lib import timeout_util
Derek Beckettf73baca2020-08-19 15:08:47 -070056import six
Richard Barnette1bf22a32016-11-18 16:14:31 -080057
58
# _FIRMWARE_REPAIR_POOLS - The set of pools whose DUTs are treated as
# firmware testing devices, i.e. managed by `FirmwareStatusVerifier`
# and `FaftFirmwareRepair`.
#
# The value is read from the comma-separated
# `pools_support_firmware_repair` setting in the `CROS` section of the
# global config.  Whitespace around each pool name is stripped and empty
# entries are dropped, so a value written as 'pool-a, pool-b' still
# matches the pool names recorded for a DUT, and an empty setting
# produces an empty set rather than a set holding the empty string.
#
_FIRMWARE_REPAIR_POOLS = set(
        pool.strip()
        for pool in global_config.global_config.get_config_value(
                'CROS',
                'pools_support_firmware_repair',
                type=str).split(',')
        if pool.strip())
67
68
def _is_firmware_testing_device(host):
    """
    Tell whether a host belongs to a pool reserved for firmware testing.

    A host counts as a firmware testing device when at least one of the
    pools recorded in its host info is listed in
    `_FIRMWARE_REPAIR_POOLS`.  Such a DUT should be managed by
    `FirmwareStatusVerifier` and `FaftFirmwareRepair`, but not by
    `FirmwareVersionVerifier` and `GeneralFirmwareRepair`.

    @param host  The host to be checked.
    @return `True` if the host should use `FirmwareStatusVerifier`
            and `FaftFirmwareRepair`; `False` otherwise.
    """
    host_pools = host.host_info_store.get().pools
    shared_pools = host_pools & _FIRMWARE_REPAIR_POOLS
    return bool(shared_pools)
Richard Barnette077665e2016-11-29 16:00:59 -080082
83
def _is_firmware_update_supported(host):
    """
    Tell whether a DUT should be kept on the standard stable firmware.

    DUTs used for general testing in the test lab (e.g. the `bvt` pool)
    have their firmware kept up-to-date by `FirmwareVersionVerifier`.
    Some pools follow a different firmware management policy; here,
    that means every host identified as a firmware testing device is
    excluded from the standard stable version update.

    @param host  The host to be checked for update policy.
    @return `True` if the host should use `FirmwareVersionVerifier`;
            `False` if it is managed by some other procedure.
    """
    managed_for_firmware_testing = _is_firmware_testing_device(host)
    return not managed_for_firmware_testing
Richard Barnette1bf22a32016-11-18 16:14:31 -0800100
101
def _get_available_firmware(host, model):
    """Get the available RW firmware version given the model.

    Runs `chromeos-firmwareupdate --manifest` on the host and reads the
    value stored at `<model>.host.versions.rw` in the JSON manifest the
    command prints.  When the manifest holds a single entry, that entry
    is used whatever `model` is.

    Every failure to obtain the version is reported as `None` instead of
    an exception: a non-zero exit status, output that is not valid JSON,
    a manifest that is not a JSON object, or a manifest lacking any of
    the expected nested keys.

    @param host The host to get available firmware for.
    @param model The model name to get corresponding firmware version.
    @return The available RW firmware version if found, else, None.
    """
    result = host.run('chromeos-firmwareupdate --manifest', ignore_status=True)
    if result.exit_status != 0:
        return None

    try:
        manifest = json.loads(result.stdout)
    except (TypeError, ValueError):
        # json reports malformed input as ValueError (JSONDecodeError is
        # a subclass); TypeError covers output that is not text at all.
        logging.error('Firmware manifest is not valid JSON.')
        return None
    if not isinstance(manifest, dict) or not manifest:
        return None

    # A manifest with a single entry is used as-is; otherwise the entry
    # is selected by model name.
    key = model if len(manifest) > 1 else next(iter(manifest))

    # Walk the keys one at a time rather than splitting a dotted path, so
    # a key that itself contains '.' is looked up intact.
    node = manifest
    for name in (key, 'host', 'versions', 'rw'):
        if not isinstance(node, dict):
            return None
        node = node.get(name, {})
    return node or None
Ningning Xia05af7402018-02-13 18:19:10 -0800121
122
class FirmwareStatusVerifier(hosts.Verifier):
    """
    Check that the firmware on a firmware-testing DUT is intact.

    Firmware tests can leave a DUT with corrupted firmware.  This
    verifier looks for signs that the firmware needs to be re-flashed
    using servo.
    """

    @timeout_util.TimeoutDecorator(cros_constants.VERIFY_TIMEOUT_SEC)
    def verify(self, host):
        """
        Dump the firmware sections from the DUT and verify slots A and B.

        Hosts that are not firmware testing devices are skipped.  The
        scratch directory on the DUT is removed whether or not the
        check passes.

        @param host  The host whose firmware is to be checked.
        @raises hosts.AutoservVerifyError if `vbutil_firmware` exits
                non-zero for either firmware slot.
        """
        if not _is_firmware_testing_device(host):
            return

        # Reads the AP firmware and dumps the sections that we're
        # interested in into a scratch directory.
        dump_cmd = ('mkdir /tmp/verify_firmware; '
                    'cd /tmp/verify_firmware; '
                    'for section in VBLOCK_A VBLOCK_B FW_MAIN_A FW_MAIN_B; '
                    'do flashrom -p host -r -i $section:$section; '
                    'done')
        # Checks one firmware slot; both `%c` fields take the slot letter.
        check_cmd = ('vbutil_firmware --verify /tmp/verify_firmware/VBLOCK_%c'
                     ' --signpubkey /usr/share/vboot/devkeys/root_key.vbpubk'
                     ' --fv /tmp/verify_firmware/FW_MAIN_%c')
        try:
            host.run(dump_cmd)
            for slot in ('A', 'B'):
                status = host.run(check_cmd % (slot, slot),
                                  ignore_status=True).exit_status
                if status:
                    raise hosts.AutoservVerifyError(
                            'Firmware %c is in a bad state.' % slot)
        finally:
            # Remove the temporary files.
            host.run('rm -rf /tmp/verify_firmware')

    @property
    def description(self):
        return 'Firmware on this DUT is clean'
Richard Barnette90ad4262016-11-17 17:29:24 -0800162
163
class FirmwareRepair(hosts.RepairAction):
    """
    Base class for repair actions that re-install firmware using servo.

    `repair()` requires a servo, asks `_get_stable_build()` which build
    to install, and passes that build to `_run_repair()`.  Both hooks
    raise `NotImplementedError` here and must be provided by subclasses.
    """
    def _get_stable_build(self, host):
        """
        Return the build whose firmware should be installed on `host`.

        @param host  The host to be repaired.
        """
        message = ('Class %s does not implement _get_stable_build()'
                   % type(self).__name__)
        raise NotImplementedError(message)

    def _run_repair(self, host, build):
        """
        Install the firmware from `build` onto `host`.

        @param host   The host to be repaired.
        @param build  The build returned by `_get_stable_build()`.
        """
        message = ('Class %s does not implement _run_repair()'
                   % type(self).__name__)
        raise NotImplementedError(message)

    def repair(self, host):
        """
        Re-install firmware on `host` from its stable build.

        @param host  The host to be repaired.
        @raises hosts.AutoservRepairError if no stable build is found.
        """
        repair_utils.require_servo(host, ignore_state=True)
        stable_build = self._get_stable_build(host)
        if stable_build:
            self._run_repair(host, stable_build)
            return
        raise hosts.AutoservRepairError(
                'Failed to find stable firmware build for %s, if the DUT is'
                ' in faft-*pool, faft stable_version needs to be set.'
                % host.hostname, 'cannot find firmware stable_version')
Richard Barnette077665e2016-11-29 16:00:59 -0800192
Richard Barnette077665e2016-11-29 16:00:59 -0800193
class FaftFirmwareRepair(FirmwareRepair):
    """
    Re-install firmware on DUTs that belong to a faft related pool.

    The build comes from the faft stable version of the host, and the
    action only applies to firmware testing devices.
    """
    def _get_stable_build(self, host):
        """
        Look up the faft stable version for the host.

        @param host  The host to be repaired.
        """
        host_info = host.host_info_store.get()
        faft_build = afe_utils.get_stable_faft_version_v2(host_info)
        return faft_build

    def _run_repair(self, host, build):
        """
        Install the firmware from `build` through the host.

        @param host   The host to be repaired.
        @param build  The faft stable build to install.
        """
        host.firmware_install(build)

    def _is_applicable(self, host):
        """
        Restrict this action to firmware testing devices.

        @param host  The host to be repaired.
        """
        applicable = _is_firmware_testing_device(host)
        return applicable

    @property
    def description(self):
        return 'Re-install the stable firmware(faft) via servo'
Garry Wangad2a1712020-03-26 15:06:43 -0700211
212
class GeneralFirmwareRepair(FirmwareRepair):
    """Reinstall the firmware for non-faft DUTs.

    Non firmware-testing DUTs get their own RepairAction because we only
    want to try re-installing firmware if all other RepairActions could
    not restore ssh capability to the DUT.
    """
    def _get_stable_build(self, host):
        """Return the build whose firmware should be installed.

        @param host The host to be repaired.
        @return The value of `host.get_cros_repair_image_name()`.
        """
        # Use firmware in current stable os build.
        return host.get_cros_repair_image_name()

    def _run_repair(self, host, build):
        """Flash EC and AP firmware from the usbkey image, then reboot.

        @param host The host to be repaired.
        @param build The stable OS build expected to be on the usbkey.
        @raises hosts.AutoservRepairError if the image found on the
                usbkey is not `build`.
        """
        # As GeneralFirmwareRepair is the last repair action, we expect
        # stable_version os image is loaded on usbkey during other repair
        # action runs. And there is also no point to repeat and waste time if
        # download image to usbkey failed in other repair actions.
        if host._servo_host.validate_image_usbkey() != build:
            raise hosts.AutoservRepairError('%s is expected to be preloaded,'
                      'however it\'s not found on the usbkey' % build,
                      'image not loaded on usbkey')
        ec_image, bios_image = host._servo_host.prepare_repair_firmware_image()

        # Before flashing firmware, record the build in the health profile.
        if host.health_profile:
            host.health_profile.set_firmware_stable_version(build)

        if ec_image:
            logging.info('Attempting to flash ec firmware...')
            host.servo.program_ec(ec_image, copy_image=False)
        if bios_image:
            logging.info('Attempting to flash bios firmware...')
            host._servo_host.flash_ap_firmware_via_servo(bios_image)

        logging.info('Cold resetting DUT through servo...')
        host.servo.get_power_state_controller().reset()
        host.wait_up(timeout=host.BOOT_TIMEOUT)
        # flash firmware via servo will turn DUT into dev mode, so disable
        # dev mode and reset gbb flag here.
        host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0', ignore_status=True)
        host.run('crossystem disable_dev_request=1', ignore_status=True)
        host.reboot()

    def _is_applicable(self, host):
        """Decide whether flashing firmware via servo should be attempted.

        The action is skipped when any of the following holds:
          * the host is a firmware testing device;
          * `host.servo` is not available;
          * the device health profile is not available;
          * the recorded repair fail count is below 2;
          * the candidate build is the one recorded in the health
            profile as already flashed, and this action has already
            succeeded, or has failed more than twice.

        @param host The host to be repaired.
        @return `True` if the repair should be attempted; `False`
                otherwise.
        """
        if _is_firmware_testing_device(host):
            logging.info('GeneralFirmwareRepair is not applicable to DUTs'
                         ' in faft pools.')
            return False
        if not host.servo:
            logging.info(
                    'The current servo state of %s is not met the'
                    ' minimum requirement to flash firmware.', host.hostname)
            # `_run_repair()` drives the flash and the cold reset through
            # `host.servo`, so without it the action cannot work.
            return False
        # Flashing firmware via servo is considered an expensive operation,
        # so we want to check repair data from previous repairs to determine
        # if firmware repair is needed.
        dhp = host.health_profile
        if not dhp:
            logging.info('Device health profile is not available, cannot'
                         ' determine if firmware repair is needed.')
            return False
        repair_fail_count = dhp.get_repair_fail_count()
        if repair_fail_count < 2:
            # We want to start with a more conservative strategy, so only try
            # this action on DUTs that failed repair at least twice.
            # @TODO(xianuowang@) adjust or remove this threshold.
            logging.info(
                    'Firmware repair will only applies to DUT that'
                    ' failed at least two AdminRepair, current fail'
                    ' count: %s', repair_fail_count)
            return False
        flashed_build = dhp.get_firmware_stable_version()
        candidate_build = self._get_stable_build(host)
        # If we had a successful firmware flash in this repair loop,
        # there is no need to retry flashing the same firmware build.
        if (dhp.get_succeed_repair_action(self.tag) > 0
                    and flashed_build == candidate_build):
            logging.info(
                    'Firmware from %s has been already installed on %s,'
                    ' no need to retry.', flashed_build, host.hostname)
            return False
        if (dhp.get_failed_repair_action(self.tag) > 2
                    and flashed_build == candidate_build):
            logging.info(
                    'Firmware from %s has been attempted and failed 3 '
                    'times, no need to retry.', flashed_build)
            return False
        return True

    @property
    def description(self):
        return 'Re-install the stable firmware(non-faft) via servo'
Garry Wangad2a1712020-03-26 15:06:43 -0700302
303
class FirmwareVersionVerifier(hosts.Verifier):
    """
    Check for a firmware update, and apply it if appropriate.

    The verifier passes when the firmware on the DUT is up-to-date, or
    when the target firmware can be installed from the build the DUT is
    currently running.

    It fails only when every one of these conditions holds:
      1. The DUT is not excluded from updates.  For example, DUTs in
         pools used for FAFT testing are skipped.
      2. The DUT has an assigned stable firmware version.
      3. The DUT is not running the assigned stable firmware.
      4. The firmware supplied in the running OS build is not the
         assigned stable firmware.

    When an upgrade is needed and the running OS build carries the
    required firmware, the verifier installs it with
    `chromeos-firmwareupdate`.  A failed install makes the verifier
    fail.

    Strictly speaking this bends the rule that "verifiers must succeed
    quickly", because the success path can call `reboot()`.  Two reasons
    justify it:
      * Updates are typically months apart, so the amortized cost is
        low.
      * Repair is separated from verify so work can be rescheduled
        right away while the expensive repair runs out-of-band.  A
        firmware update is likely to hit all DUTs at once, so handing
        the job to repair would gain nothing.

    N.B. This verifier is a trigger for all repair actions that install
    the stable repair image.  If the firmware is out-of-date, but the
    stable repair image does *not* contain the proper firmware version,
    _the target DUT will fail repair, and will be unable to fix itself_.
    """

    @staticmethod
    def _get_rw_firmware(host):
        """
        Read the firmware id reported by `crossystem fwid`.

        @param host  The host to query.
        @return The command's stdout, or `None` if the command exits
                non-zero.
        """
        fwid = host.run('crossystem fwid', ignore_status=True)
        return fwid.stdout if fwid.exit_status == 0 else None

    @staticmethod
    def _check_hardware_match(version_a, version_b):
        """
        Ensure two firmware versions were built for the same hardware.

        A firmware version string looks like this:
            Google_Gnawty.5216.239.34
        The text ahead of the first '.' names the hardware the firmware
        was built for; that part of `version_a` and `version_b` must be
        equal.

        This guards against installing the wrong firmware on a DUT whose
        board label has somehow gone astray.

        @param version_a  First firmware version for the comparison.
        @param version_b  Second firmware version for the comparison.
        @raises hosts.AutoservVerifyError if the hardware parts differ.
        """
        hardware_a, hardware_b = (
                version.split('.')[0] for version in (version_a, version_b))
        if hardware_a == hardware_b:
            return
        raise hosts.AutoservVerifyError(
                'Hardware/Firmware mismatch updating %s to %s'
                % (version_a, version_b))

    @timeout_util.TimeoutDecorator(cros_constants.VERIFY_TIMEOUT_SEC)
    def verify(self, host):
        """
        Run the four tests from the class docstring; update if possible.

        @param host  The host whose firmware version is to be checked.
        @raises hosts.AutoservVerifyError if the host has no model, if
                the running OS build does not supply the stable
                firmware, if the hardware names differ, or if the
                update fails or does not take effect.
        """
        # Test 1 - The DUT is not excluded from updates.
        if not _is_firmware_update_supported(host):
            return

        # Test 2 - The DUT has an assigned stable firmware version.
        host_info = host.host_info_store.get()
        if host_info.model is None:
            raise hosts.AutoservVerifyError(
                    'Can not verify firmware version. '
                    'No model label value found')
        try:
            target_fw = afe_utils.get_stable_firmware_version_v2(host_info)
        except Exception as e:
            # A failed lookup is handled like a DUT with no update target.
            target_fw = None
            logging.exception('Failed lookup to AFE for stable fw version '
                              ' with exception: %s', e)
        if target_fw is None:
            # This DUT doesn't have a firmware update target
            return

        # For tests 3 and 4: unexpected output from `crossystem` or
        # `chromeos-firmwareupdate` is logged as an error without
        # failing, so that a bug or change in either command cannot by
        # itself leave DUTs unable to test a build.

        # Test 3 - The DUT is not running the target stable firmware.
        running_fw = self._get_rw_firmware(host)
        if running_fw is None:
            logging.error("DUT firmware version can't be determined.")
            return
        if running_fw == target_fw:
            return

        # Test 4 - The firmware supplied in the running OS build is not
        # the assigned stable firmware.
        bundled_fw = _get_available_firmware(host, host_info.model)
        if bundled_fw is None:
            logging.error("Supplied firmware version in OS can't be "
                          "determined.")
            return
        if bundled_fw != target_fw:
            raise hosts.AutoservVerifyError(
                    'DUT firmware requires update from %s to %s' %
                    (running_fw, target_fw))

        # Time to update the firmware.
        logging.info('Updating firmware from %s to %s',
                     running_fw, target_fw)
        self._check_hardware_match(running_fw, target_fw)
        try:
            host.run('chromeos-firmwareupdate --mode=autoupdate')
            host.reboot()
        except Exception:
            failure = ('chromeos-firmwareupdate failed: from '
                       '%s to %s')
            logging.exception(failure, running_fw, target_fw)
            raise hosts.AutoservVerifyError(
                    failure % (running_fw, target_fw))

        # Confirm that the DUT now reports the stable firmware.
        resulting_fw = self._get_rw_firmware(host)
        if resulting_fw != target_fw:
            failure = ('chromeos-firmwareupdate failed: tried upgrade '
                       'to %s, now running %s instead')
            raise hosts.AutoservVerifyError(
                    failure % (target_fw, resulting_fw))

    @property
    def description(self):
        return 'The firmware on this DUT is up-to-date'