toolchain-utils: change naming related to AFE lock

After introducing the new locking mechanism, the behavior of
AFELockManager totally changed. This patch changed the naming of all
related code in toolchain-utils.

This patch also changed the behavior of the locks_dir option and
explicitly deprecated the use_file_lock option.

TEST=Tested with different DUT types.
BUG=chromium:1006434

Change-Id: Ib15efce54ec4d4c5c2a18fecca3f350248462035
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/1863530
Reviewed-by: Caroline Tice <cmtice@chromium.org>
Reviewed-by: Denis Nikitin <denik@chromium.org>
Commit-Queue: Zhizhou Yang <zhizhouy@google.com>
Tested-by: Zhizhou Yang <zhizhouy@google.com>
diff --git a/lock_machine.py b/lock_machine.py
new file mode 100755
index 0000000..40c7d8f
--- /dev/null
+++ b/lock_machine.py
@@ -0,0 +1,618 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+#
+# Copyright 2019 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""This module controls locking and unlocking of test machines."""
+
+from __future__ import print_function
+
+import argparse
+import enum
+import getpass
+import os
+import sys
+
+import file_lock_machine
+
+from cros_utils import command_executer
+from cros_utils import logger
+from cros_utils import machines
+
+
+class LockException(Exception):
+  """Base class for exceptions in this module."""
+
+
+class MachineNotPingable(LockException):
+  """Raised when a machine does not respond to ping."""
+
+
+class LockingError(LockException):
+  """Raised when server fails to lock/unlock machine as requested."""
+
+
+class DontOwnLock(LockException):
+  """Raised when user attempts to unlock machine locked by someone else."""
+  # This should not be raised if the user specified '--force'
+
+
+class NoAFEServer(LockException):
+  """Raised when the autotest server cannot be found or accessed."""
+
+
+class AFEAccessError(LockException):
+  """Raised when the lab server has no information about a lab machine."""
+
+
+class MachineType(enum.Enum):
+  """Kind of machine, which selects the locking mechanism used for it."""
+  AFE = 'afe'
+  LOCAL = 'local'
+  SKYLAB = 'skylab'
+
+
+class LockManager(object):
+  """Class for locking/unlocking machines via three different modes.
+
+  This class contains methods for checking the locked status of machines,
+  and for changing the locked status.  It handles HW lab machines (both AFE
+  and Skylab), and local machines, using appropriate locking mechanisms for
+  each.
+
+  !!!IMPORTANT NOTE!!!  The AFE server can only be called from the main
+  thread/process of a program.  If you launch threads and try to call it
+  from a thread, you will get an error.  This has to do with restrictions
+  in the Python virtual machine (and signal handling) and cannot be changed.
+  """
+
+  SKYLAB_PATH = '/usr/local/bin/skylab'
+  LEASE_MINS = 600
+  SKYLAB_CREDENTIAL = '/usr/local/google/home/mobiletc-prebuild' \
+                      '/sheriff_utils/skylab_credential' \
+                      '/chromeos-swarming-credential.json'
+  SWARMING = 'chromite/third_party/swarming.client/swarming.py'
+  SUCCESS = 0
+
+  def __init__(self,
+               remotes,
+               force_option,
+               chromeos_root,
+               locks_dir='',
+               log=None):
+    """Initializes a LockManager object.
+
+    Args:
+      remotes: A list of machine names or ip addresses to be managed.  Names
+        and ip addresses should be represented as strings.  If the list is
+        empty, the lock manager will get all known machines.
+      force_option: A Boolean indicating whether or not to force an unlock of
+        a machine that was locked by someone else.
+      chromeos_root: The ChromeOS chroot to use for the autotest scripts.
+      locks_dir: A directory used for file locking local devices.
+      log: If not None, this is the logger object to be used for writing out
+        informational output messages.  It is expected to be an instance of
+        Logger class from cros_utils/logger.py.
+    """
+    self.chromeos_root = chromeos_root
+    self.user = getpass.getuser()
+    self.logger = log or logger.GetLogger()
+    self.ce = command_executer.GetCommandExecuter(self.logger)
+    autotest_path = os.path.join(chromeos_root,
+                                 'src/third_party/autotest/files')
+
+    sys.path.append(chromeos_root)
+    sys.path.append(autotest_path)
+    sys.path.append(os.path.join(autotest_path, 'server', 'cros'))
+
+    self.locks_dir = locks_dir
+
+    # The autotest modules below are only importable once the paths above
+    # have been appended to sys.path, so they cannot be imported at file level.
+    # pylint: disable=import-error
+    from client import setup_modules
+    setup_modules.setup(
+        base_path=autotest_path, root_module_name='autotest_lib')
+
+    from dynamic_suite import frontend_wrappers
+
+    self.afe = frontend_wrappers.RetryingAFE(
+        timeout_min=30, delay_sec=10, debug=False, server='cautotest')
+
+    self.machines = list(set(remotes)) or []
+    self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
+
+    if not self.machines:
+      self.machines = self.toolchain_lab_machines
+    self.force = force_option
+
+    self.local_machines = []
+    self.skylab_machines = []
+
+  def CheckMachine(self, machine, error_msg):
+    """Checks that machine, or its '.cros' alias, responds to ping.
+
+    Args:
+      machine: String containing the name or ip address of machine to check.
+      error_msg: Message carried by the exception if both pings fail.
+
+    Raises:
+      MachineNotPingable:  If neither name responds to 'ping'.
+    """
+    if machines.MachineIsPingable(machine, logging_level='none'):
+      return
+    if not machines.MachineIsPingable(machine + '.cros', logging_level='none'):
+      raise MachineNotPingable(error_msg)
+
+  def GetAllToolchainLabMachines(self):
+    """Gets a list of all the toolchain machines in the ChromeOS HW lab.
+
+    The names are read from 'crosperf/default_remotes' next to this file.  On
+    each line, whitespace-separated names follow the first ':'; the text
+    before it is ignored, and lines without a ':' contribute nothing.
+
+    Returns:
+      A list of names of the toolchain machines in the ChromeOS HW lab.
+    """
+    machines_file = os.path.join(
+        os.path.dirname(__file__), 'crosperf', 'default_remotes')
+    machine_list = []
+    with open(machines_file, 'r') as input_file:
+      for line in input_file:
+        machine_list.extend(line.partition(':')[2].split())
+    return machine_list
+
+  def GetMachineType(self, m):
+    """Classifies a machine as local, Skylab or (by default) AFE.
+
+    Args:
+      m: String containing the name or ip address of machine.
+
+    Returns:
+      MachineType.LOCAL or SKYLAB if registered as such, else MachineType.AFE.
+    """
+    for kind, members in ((MachineType.LOCAL, self.local_machines),
+                          (MachineType.SKYLAB, self.skylab_machines)):
+      if m in members:
+        return kind
+    return MachineType.AFE
+
+  def PrintStatusHeader(self):
+    """Prints the two header lines of the machine status table."""
+    print('\nMachine (Board)\t\t\t\t\tStatus')
+    print('---------------\t\t\t\t\t------')
+
+  def PrintStatus(self, m, state, machine_type):
+    """Prints one status line for a single machine.
+
+    Args:
+      m: String containing the name or ip address of machine.
+      state: A dictionary of the current state of the machine.
+      machine_type: MachineType to determine where the machine is located.
+    """
+    if machine_type == MachineType.AFE and not m.endswith('.cros'):
+      m += '.cros'
+    # Local machines get two extra tabs in front of the status text.
+    padding = '\t\t' if machine_type == MachineType.LOCAL else ''
+    if state['locked']:
+      line = '%s (%s)\t\t%slocked by %s since %s' % (
+          m, state['board'], padding, state['locked_by'], state['lock_time'])
+    else:
+      line = '%s (%s)\t\t%sunlocked' % (m, state['board'], padding)
+    print(line)
+
+  def AddMachineToLocal(self, machine):
+    """Adds a machine to local machine list, unless it is already in it.
+
+    Args:
+      machine: The machine to be added.
+    """
+    if machine not in self.local_machines:
+      self.local_machines.append(machine)
+
+  def AddMachineToSkylab(self, machine):
+    """Adds a machine to skylab machine list, unless it is already in it.
+
+    Args:
+      machine: The machine to be added.
+    """
+    if machine not in self.skylab_machines:
+      self.skylab_machines.append(machine)
+
+  def ListMachineStates(self, machine_states):
+    """Prints the status table for the given machine states.
+
+    Prints the header lines followed by one status line for every entry of
+    machine_states; the type of each machine decides how its line is
+    formatted.
+
+    Args:
+      machine_states: A dictionary mapping each machine name to the
+        dictionary describing its current state.  Normally obtained by
+        calling LockManager::GetMachineStates.
+    """
+    self.PrintStatusHeader()
+    for name, state in machine_states.items():
+      kind = self.GetMachineType(name)
+      self.PrintStatus(name, state, kind)
+
+  def UpdateLockInAFE(self, should_lock_machine, machine):
+    """Calls an AFE server to lock/unlock a machine.
+
+    Args:
+      should_lock_machine: Boolean indicating whether to lock the machine (True)
+        or unlock the machine (False).
+      machine: The machine to update.
+
+    Returns:
+      True if requested action succeeded, else False (the failure is logged).
+    """
+    kwargs = {'locked': should_lock_machine}
+    if should_lock_machine:
+      kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user
+
+    m = machine.split('.')[0]
+
+    try:
+      self.afe.run(
+          'modify_hosts',
+          host_filter_data={'hostname__in': [m]},
+          update_data=kwargs)
+    except Exception as e:  # pylint: disable=broad-except
+      self.logger.LogWarning('AFE update of %s failed: %s' % (m, e))
+      return False
+    return True
+
+  def UpdateLockInSkylab(self, should_lock_machine, machine):
+    """Ask skylab to lease/release a machine.
+
+    Args:
+      should_lock_machine: Boolean indicating whether to lock the machine (True)
+        or unlock the machine (False).
+      machine: The machine to update.
+
+    Returns:
+      True if requested action succeeded; False if it failed or raised an
+      exception (which is logged as a warning).
+    """
+    update = (self.LeaseSkylabMachine
+              if should_lock_machine else self.ReleaseSkylabMachine)
+    try:
+      return update(machine)
+    except Exception as e:  # pylint: disable=broad-except
+      self.logger.LogWarning('Skylab update of %s failed: %s' % (machine, e))
+      return False
+
+  def UpdateFileLock(self, should_lock_machine, machine):
+    """Use file lock for local machines.
+
+    Args:
+      should_lock_machine: Boolean indicating whether to lock the machine (True)
+        or unlock the machine (False).
+      machine: The machine to update.
+
+    Returns:
+      True if requested action succeeded; False if it failed or raised an
+      exception (which is logged as a warning).
+    """
+    try:
+      lock = file_lock_machine.Machine(machine, self.locks_dir)
+      if should_lock_machine:
+        return lock.Lock(True, sys.argv[0])
+      return lock.Unlock(True)
+    except Exception as e:  # pylint: disable=broad-except
+      self.logger.LogWarning('File lock of %s failed: %s' % (machine, e))
+      return False
+
+  def UpdateMachines(self, lock_machines):
+    """Locks or unlocks every machine in self.machines.
+
+    Each machine is updated through the mechanism matching its machine type
+    (Skylab lease, file lock, or AFE) and the outcome is logged.
+
+    Note that self.machines is replaced afterwards by the list of machines
+    that were successfully updated.
+
+    Args:
+      lock_machines: Boolean indicating whether to lock the machines (True) or
+        unlock the machines (False).
+
+    Returns:
+      A list of the machines whose state was successfully updated.
+    """
+    updated_machines = []
+    action = 'Locking' if lock_machines else 'Unlocking'
+    updaters = {
+        MachineType.SKYLAB: self.UpdateLockInSkylab,
+        MachineType.LOCAL: self.UpdateFileLock,
+    }
+    for m in self.machines:
+      # TODO(zhizhouy): Handling exceptions with more details when locking
+      # doesn't succeed.
+      machine_type = self.GetMachineType(m)
+      update = updaters.get(machine_type, self.UpdateLockInAFE)
+      succeeded = update(lock_machines, m)
+      template = ('%s %s machine succeeded: %s.'
+                  if succeeded else '%s %s machine failed: %s.')
+      self.logger.LogOutput(template % (action, machine_type.value, m))
+      if succeeded:
+        updated_machines.append(m)
+
+    self.machines = updated_machines
+    return updated_machines
+
+  def _InternalRemoveMachine(self, machine):
+    """Removes machine, and its '.cros' form, from self.machines.
+
+    Args:
+      machine: Name of machine to be removed from internal list.
+    """
+    # Lab machine names ('rack' and 'row' found after the first character)
+    # may be stored with a '.cros' suffix, so that form is removed as well.
+    cros_machine = machine
+    looks_like_lab_name = (
+        machine.find('rack') > 0 and machine.find('row') > 0)
+    if looks_like_lab_name and '.cros' not in machine:
+      cros_machine += '.cros'
+
+    unwanted = (machine, cros_machine)
+    self.machines = [m for m in self.machines if m not in unwanted]
+
+  def CheckMachineLocks(self, machine_states, cmd):
+    """Check that every machine in requested list is in the proper state.
+
+    If the cmd is 'unlock' verify that every machine is locked by requestor.
+    If the cmd is 'lock' verify that every machine is currently unlocked.
+    Machines already in the requested state are dropped from self.machines.
+
+    Args:
+      machine_states: A dictionary of the current state of every machine in
+        self.machines.  Normally obtained by calling GetMachineStates.
+      cmd: The user-requested action for the machines: 'lock' or 'unlock'.
+
+    Raises:
+      DontOwnLock: The lock on a requested machine is owned by someone else.
+    """
+    for k, state in machine_states.items():
+      if cmd == 'unlock':
+        if not state['locked']:
+          self.logger.LogWarning('Attempt to unlock already unlocked machine '
+                                 '(%s).' % k)
+          self._InternalRemoveMachine(k)
+
+        # TODO(zhizhouy): Skylab doesn't support host info such as locked_by.
+        # Need to update this when skylab supports it.
+        if (state['locked'] and state['locked_by'] and
+            state['locked_by'] != self.user):
+          raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
+                            'else (%s).' % (k, state['locked_by']))
+      elif cmd == 'lock':
+        if state['locked']:
+          self.logger.LogWarning(
+              'Attempt to lock already locked machine (%s)' % k)
+          self._InternalRemoveMachine(k)
+
+  def GetMachineStates(self, cmd=''):
+    """Gets the current state of all the requested machines.
+
+    Gets the current state of all the requested machines, stores the data in a
+    dictionary keyed by machine name, and prints it via ListMachineStates.
+
+    Args:
+      cmd: The command for which we are getting the machine states. It is only
+        used for local and skylab machines, whose real state is not queried:
+        they are reported as unlocked for 'lock' and as locked otherwise.
+
+    Returns:
+      A dictionary of machine states for all the machines in the LockManager
+      object.
+
+    Raises:
+      NoAFEServer:  Cannot find the HW Lab AFE server.
+      AFEAccessError:  An error occurred when querying the server about a
+        machine.
+    """
+    if not self.afe:
+      raise NoAFEServer('Error: Cannot connect to main AFE server.')
+
+    machine_list = {}
+    for m in self.machines:
+      # Local and skylab machines are not queried: their state is assumed
+      # from cmd (unlocked when locking, locked otherwise).
+      # TODO(zhizhouy): This is a quick fix since skylab cannot return host
+      # info as afe does. Get more info such as locked_by once it can.
+      if m in self.local_machines or m in self.skylab_machines:
+        values = {
+            'locked': 0 if cmd == 'lock' else 1,
+            'board': '??',
+            'locked_by': '',
+            'lock_time': ''
+        }
+        machine_list[m] = values
+      else:
+        # AFE machines: query the AFE; the entry is keyed by the AFE's hostname.
+        mod_host = m.split('.')[0]
+        host_info = self.afe.get_hosts(hostname=mod_host)
+        if not host_info:
+          raise AFEAccessError('Unable to get information about %s from main'
+                               ' autotest server.' % m)
+        host_info = host_info[0]
+        name = host_info.hostname
+        values = {}
+        values['board'] = host_info.platform if host_info.platform else '??'
+        values['locked'] = host_info.locked
+        if host_info.locked:
+          values['locked_by'] = host_info.locked_by
+          values['lock_time'] = host_info.lock_time
+        else:
+          values['locked_by'] = ''
+          values['lock_time'] = ''
+        machine_list[name] = values
+
+    self.ListMachineStates(machine_list)
+
+    return machine_list
+
+  def CheckMachineInSkylab(self, machine):
+    """Run command to check if machine is in Skylab or not.
+
+    Args:
+      machine: Name of the machine; a trailing '.cros' is dropped.
+
+    Returns:
+      True if machine in skylab, else False
+    """
+    credential = ''
+    if os.path.exists(self.SKYLAB_CREDENTIAL):
+      credential = '--auth-service-account-json %s' % self.SKYLAB_CREDENTIAL
+    swarming = os.path.join(self.chromeos_root, self.SWARMING)
+    # NB: rstrip('.cros') would strip a character set, not the suffix.
+    dut_name = machine[:-5] if machine.endswith('.cros') else machine
+    cmd = (('%s query --swarming https://chromeos-swarming.appspot.com ' \
+            "%s 'bots/list?is_dead=FALSE&dimensions=dut_name:%s'") % \
+           (swarming, credential, dut_name))
+    ret_tup = self.ce.RunCommandWOutput(cmd)
+    # The command will return a json output as stdout. If machine not in skylab
+    # stdout will look like this:
+    #  {
+    #    "death_timeout": "600",
+    #    "now": "TIMESTAMP"
+    #  }
+    # Otherwise there will be a tuple starting with 'items', we simply detect
+    # this keyword for result.
+    return 'items' in ret_tup[1]
+
+  def LeaseSkylabMachine(self, machine):
+    """Run command to lease dut from skylab.
+
+    Returns:
+      True if succeeded, False if failed.
+    """
+    credential = ''
+    if os.path.exists(self.SKYLAB_CREDENTIAL):
+      credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL
+    # Drop a trailing '.cros' explicitly: rstrip('.cros') would strip a
+    # character set rather than the suffix and could eat into the name.
+    dut_name = machine[:-5] if machine.endswith('.cros') else machine
+    cmd = ('%s lease-dut -minutes %s %s %s' %
+           (self.SKYLAB_PATH, self.LEASE_MINS, credential, dut_name))
+    # Wait 120 seconds for server to start the lease task, if not started,
+    # we will treat it as unavailable.
+    check_interval_time = 120
+    retval = self.ce.RunCommand(cmd, command_timeout=check_interval_time)
+    return retval == self.SUCCESS
+
+  def ReleaseSkylabMachine(self, machine):
+    """Run command to release dut from skylab.
+
+    Returns:
+      True if succeeded, False if failed.
+    """
+    credential = ''
+    if os.path.exists(self.SKYLAB_CREDENTIAL):
+      credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL
+    # NB: rstrip('.cros') would strip a character set, not the suffix.
+    dut_name = machine[:-5] if machine.endswith('.cros') else machine
+    cmd = ('%s release-dut %s %s' %
+           (self.SKYLAB_PATH, credential, dut_name))
+    retval = self.ce.RunCommand(cmd)
+    return retval == self.SUCCESS
+
+
+def Main(argv):
+  """Parse the options, initialize lock manager and dispatch proper method.
+
+  The status of the selected machines is always printed.  For --lock and
+  --unlock the machines are then updated; unless --force is given, their
+  current state is checked first (see LockManager.CheckMachineLocks).
+
+  Args:
+    argv: The options with which this script was invoked.
+
+  Returns:
+    0 unless an exception is raised.
+
+  Raises:
+    DontOwnLock: When unlocking, without --force, a machine that is locked
+      by someone else.
+  """
+  parser = argparse.ArgumentParser()
+
+  parser.add_argument(
+      '--list',
+      dest='cmd',
+      action='store_const',
+      const='status',
+      help='List current status of all known machines.')
+  parser.add_argument(
+      '--lock',
+      dest='cmd',
+      action='store_const',
+      const='lock',
+      help='Lock given machine(s).')
+  parser.add_argument(
+      '--unlock',
+      dest='cmd',
+      action='store_const',
+      const='unlock',
+      help='Unlock given machine(s).')
+  parser.add_argument(
+      '--status',
+      dest='cmd',
+      action='store_const',
+      const='status',
+      help='List current status of given machine(s).')
+  parser.add_argument(
+      '--remote', dest='remote', help='machines on which to operate')
+  parser.add_argument(
+      '--chromeos_root',
+      dest='chromeos_root',
+      required=True,
+      help='ChromeOS root to use for autotest scripts.')
+  parser.add_argument(
+      '--force',
+      dest='force',
+      action='store_true',
+      default=False,
+      help='Force lock/unlock of machines, even if not'
+      ' current lock owner.')
+
+  options = parser.parse_args(argv)
+
+  # Check the operation first so that a missing one is reported as such.
+  if not options.cmd:
+    parser.error('No operation selected (--list, --status, --lock, --unlock).')
+
+  # Only the status operations may omit --remote; LockManager then falls back
+  # to all the toolchain lab machines.
+  if not options.remote and options.cmd != 'status':
+    parser.error('No machines specified for operation.')
+
+  if not os.path.isdir(options.chromeos_root):
+    parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)
+
+  machine_list = []
+  if options.remote:
+    machine_list = options.remote.split()
+
+  lock_manager = LockManager(machine_list, options.force, options.chromeos_root)
+
+  # GetMachineStates already prints the status table, which is all that
+  # --list/--status need; printing it again here would only duplicate it.
+  machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
+  cmd = options.cmd
+
+  if cmd in ('lock', 'unlock'):
+    # --force only skips the state/ownership checks; the update itself has to
+    # run in both cases.
+    if not lock_manager.force:
+      lock_manager.CheckMachineLocks(machine_states, cmd)
+    lock_manager.UpdateMachines(cmd == 'lock')
+
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(Main(sys.argv[1:]))