bisect-kit: migrate chromeos bisector to use codechange module

After this CL, chromeos localbuild bisector is more robust and can
handle following issues:
 - add and remove repo projects
 - manifest snapshot racing

Like android local build bisector, the setup step of chromeos checkout
is changed as well. Now you have to make a repo mirror and sync the
tree from the mirror. The exact steps are:

 1. setup a repo mirror
    $ cd $CHROMEOS_REPO_MIRROR_DIR
    $ repo init ...<original flags>... --mirror
    $ mkdir .repo/local_manifests
    $ cat << END > .repo/local_manifests/manifest-versions.xml
    <?xml version="1.0" encoding="UTF-8"?>
    <manifest>
      <project name="chromeos/manifest-versions" remote="cros-internal" />
    </manifest>
    END
    $ repo sync

 2. checkout chromeos tree from the mirror
    $ cd $CHROMEOS_ROOT
    $ repo init ...<original flags>...  --reference=$CHROMEOS_REPO_MIRROR_DIR
    $ repo sync

 3. specify --chromeos_repo_mirror_dir $CHROMEOS_REPO_MIRROR_DIR when you
    use bisect_cros_repo.py and switch_cros_localbuild.py

Due to crbug.com/864886, this bisector can only work with version of
chromite after 2018-07-19. Workaround will be provided in separate CL
later.

BUG=chromium:827092
TEST=unit test and following commands
$ ./bisect_cros_repo.py init --old R69-10883.0.0 --new R69-10884.0.0 \
  --chromeos_root $CHROMEOS_ROOT \
  --chromeos_repo_mirror_dir $CHROMEOS_REPO_MIRROR_DIR \
  --board samus
$ ./switch_cros_localbuild.py samus-test R69-10883.0.0~R69-10884.0.0/10 \
  --chromeos_root $CHROMEOS_ROOT \
  --chromeos_repo_mirror_dir $CHROMEOS_REPO_MIRROR_DIR

Change-Id: I594d850f80a9ff05b7241106bdc7ff191cf12676
Reviewed-on: https://chromium-review.googlesource.com/1143114
Commit-Ready: Kuang-che Wu <kcwu@chromium.org>
Tested-by: Kuang-che Wu <kcwu@chromium.org>
Reviewed-by: Chi-Ngai Wan <cnwan@google.com>
diff --git a/bisect_kit/git_util.py b/bisect_kit/git_util.py
index 0c393b2..f3749fa 100644
--- a/bisect_kit/git_util.py
+++ b/bisect_kit/git_util.py
@@ -150,18 +150,19 @@
   return git_rev
 
 
-def get_commit_time(git_repo, rev):
+def get_commit_time(git_repo, rev, path):
   """Get git commit timestamp.
 
   Args:
     git_repo: path of git repo
-    rev: git commit id
+    rev: git commit id, branch name, tag name, or other git object
+    path: path, relative to git_repo
 
   Returns:
     timestamp (int)
   """
   line = util.check_output(
-      'git', 'log', '-1', '--format=%ct', rev, cwd=git_repo)
+      'git', 'log', '-1', '--format=%ct', rev, '--', path, cwd=git_repo)
   return int(line)
 
 
@@ -180,101 +181,182 @@
       'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_output=False)
 
 
-def get_rev_by_time(git_repo, timestamp, *args):
+def list_dir_from_revision(git_repo, rev, path):
+  """Lists entries of directory of given revision.
+
+  Args:
+    git_repo: path of git repo
+    rev: git commit id
+    path: directory path, relative to git root
+
+  Returns:
+    list of names
+
+  Raises:
+    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
+  """
+  return util.check_output(
+      'git',
+      'ls-tree',
+      '--name-only',
+      '%s:%s' % (rev, path),
+      cwd=git_repo,
+      log_output=False).splitlines()
+
+
+def get_rev_by_time(git_repo, timestamp, path=None, *args):
   """Query commit of given time.
 
   Args:
     git_repo: path of git repo.
     timestamp: timestamp
+    path: only query history of path, relative to git_repo
     args: only the selected subset of history to query. If branch name is
         specified, only parent of the said branch is queried. If omitted, only
         queries the parent of HEAD.
 
   Returns:
-    git commit hash
+    git commit hash. None if path didn't exist at the given time.
   """
   if not args:
     args = ['HEAD']
-  return util.check_output(
+
+  cmd = [
       'git',
       'rev-list',
       '--first-parent',
       '-1',
       '--before',
       str(timestamp),
-      *args,
-      cwd=git_repo).strip()
+  ] + list(args)
+  if path:
+    cmd += ['--', path]
+
+  result = util.check_output(*cmd, cwd=git_repo).strip()
+  return result or None
 
 
-def get_history(git_repo, path, after=None):
+def get_history(git_repo, path, after=None, before=None, padding=False):
   """Get commit history of given path.
 
+  `after` and `before` could be outside of lifetime of `path`. `padding` is
+  used to control what to return for such cases.
+
   Args:
     git_repo: path of git repo.
     path: path to query, relative to git_repo
-    after: limit history after given time
+    after: limit history after given time (inclusive)
+    before: limit history before given time (inclusive)
+    padding: If True, pads returned result with dummy record at exact 'after'
+        and 'before' time, if 'path' existed at that time. Otherwise, only
+        returns real commits.
 
-  Yields:
-    commit timestamp, git hash
+  Returns:
+    List of (timestamp, git hash); They are all events when `path` was added,
+    removed, modified, and start and end time if `padding` is true.
+
+    For each pair, at `timestamp`, the repo state is `git hash`. In other
+    words, `timestamp` is not necessary the commit time of `git hash` for the
+    padded entries.
   """
   cmd = ['git', 'log', '--reverse', '--first-parent', '--format=%ct %H']
   if after:
     cmd += ['--after', str(after)]
-  cmd.append(path)
+  if before:
+    cmd += ['--before', str(before)]
+  # '--' is necessary otherwise if `path` is removed in current revision, git
+  # will complain it's an ambiguous argument which may be path or something
+  # else (like git branch name, tag name, etc.)
+  cmd += ['--', path]
+
+  result = []
   for line in util.check_output(*cmd, cwd=git_repo).splitlines():
     commit_time, git_rev = line.split()
-    yield int(commit_time), git_rev
+    result.append((int(commit_time), git_rev))
+
+  if padding:
+    assert before or after, "padding=True make no sense if they are both None"
+    if before is not None and get_rev_by_time(git_repo, before, path):
+      before = int(before)
+      if not result or result[-1][0] != before:
+        git_rev = get_rev_by_time(git_repo, before)
+        assert git_rev
+        result.append((before, git_rev))
+    if after is not None and get_rev_by_time(git_repo, after, path):
+      after = int(after)
+      if not result or result[0][0] != after:
+        git_rev = get_rev_by_time(git_repo, after)
+        assert git_rev
+        result.insert(0, (after, git_rev))
+
+  return result
 
 
-class Diff(object):
-  """Class to describe the difference between git commits.
+def get_history_recursively(git_repo, path, after, before, parser_callback):
+  """Get commit history of given path and its dependencies.
 
-  Attributes:
-    timestamp: commit timestamp
-    path: git repo path relative to project root
-    action: action to make the diff, possible value: CHECKOUT_TO, ADD, REMOVE.
-    git_rev: git commit hash
+  In comparison to get_history(), get_history_recursively also takes
+  dependencies into consideration. For example, if file A referenced file B,
+  get_history_recursively(A) will return commits of B in addition to A. This
+  applies recursively, so commits of C will be included if file B referenced
+  file C, and so on.
+
+  This function is file type neutral. `parser_callback(filename, content)` will
+  be invoked to parse file content and should return list of filename of
+  dependencies.
+
+  Args:
+    git_repo: path of git repo
+    path: path to query, relative to git_repo
+    after: limit history after given time (inclusive)
+    before: limit history before given time (inclusive)
+    parser_callback: callback to parse file content. See above comment.
+
+  Returns:
+    list of (commit timestamp, git hash)
   """
-  CHECKOUT_TO = 'checkout_to'
-  ADD = 'add'
-  REMOVE = 'remove'
+  history = get_history(
+      git_repo, path, after=after, before=before, padding=True)
 
-  def __init__(self, timestamp, path, action, git_rev=None):
-    self.timestamp = timestamp
-    self.path = path
-    self.action = action
-    self.git_rev = git_rev
+  # Collect include information of each commit.
+  includes = {}
+  for commit_time, git_rev in history:
+    content = get_file_from_revision(git_repo, git_rev, path)
+    for include_name in parser_callback(path, content):
+      if include_name not in includes:
+        includes[include_name] = set()
+      includes[include_name].add(git_rev)
 
-  def apply(self, base_dir):
-    """Applies the diff on disk.
+  # Analyze the start time and end time of each include.
+  dependencies = []
+  for include in includes:
+    appeared = None
+    for commit_time, git_rev in history:
+      if git_rev in includes[include]:
+        if not appeared:
+          appeared = commit_time
+      else:
+        if appeared:
+          dependencies.append((include, appeared, commit_time))
+          appeared = None
 
-    Args:
-      base_dir: the project root where self.path is relative to
-    """
-    assert self.path
-    git_repo = os.path.join(base_dir, self.path)
-    if self.action == Diff.CHECKOUT_TO:
-      checkout_version(git_repo, self.git_rev)
-      return
-    if self.action in ['add', 'remove']:
-      raise NotImplementedError
-    assert 0
+    if appeared is not None:
+      dependencies.append((include, appeared, before))
 
-  def summary(self, base_dir):
-    """Summary string of this diff.
+  # Recursion and merge.
+  result = list(history)
+  for include, appeared, disappeared in dependencies:
+    result += get_history_recursively(git_repo, include, appeared, disappeared,
+                                      parser_callback)
 
-    Args:
-      base_dir: the project root where self.path is relative to
-    """
-    if self.action == Diff.CHECKOUT_TO:
-      git_repo = os.path.join(base_dir, self.path)
-      summary = get_commit_log(git_repo, self.git_rev).splitlines()[0]
-      return '%s %s %r' % (self.git_rev[:10], self.path, summary)
-    return '%s %s' % (self.action, self.path)
+  # Sort and dedup.
+  result2 = []
+  for x in sorted(result):
+    if result2 and result2[-1] == x:
+      continue
+    result2.append(x)
 
-  def __eq__(self, rhs):
-    return (self.timestamp == rhs.timestamp and self.path == rhs.path and
-            self.action == rhs.action and self.git_rev == rhs.git_rev)
+  return result2
 
 
 def list_commits_between_commits(git_repo, old, new):
@@ -323,61 +405,3 @@
     logger.warning('%d timestamps adjusted', adjusted)
 
   return commits
-
-
-def get_difflist_between_two_commit(base_dir, path, old, new):
-  """Get difflist between (old, new].
-
-  Args:
-    base_dir: the project root
-    path: the path relative to the project root
-    old: old commit hash (exclusive)
-    new: new commit hash (inclusive)
-
-  Returns:
-    list of Diff objects
-  """
-  git_repo = os.path.join(base_dir, path)
-  difflist = []
-  for timestamp, git_rev in list_commits_between_commits(git_repo, old, new):
-    difflist.append(Diff(timestamp, path, Diff.CHECKOUT_TO, git_rev))
-  return difflist
-
-
-def get_difflist_between_two_set(base_dir, old_set, new_set):
-  result = []
-  for path in set(old_set) | set(new_set):
-    git_repo = os.path.join(base_dir, path)
-    if path in old_set and path in new_set:
-      old = old_set[path]
-      new = new_set[path]
-      if old == new:
-        # nochange, do nothing
-        pass
-      elif is_ancestor_commit(git_repo, old, new):
-        # normal case
-        for diff in get_difflist_between_two_commit(base_dir, path, old, new):
-          result.append(diff)
-      else:
-        # maybe switch branch?
-        # TODO(kcwu): handle discontinuous properly (crbug.com/827092)
-        logger.warning(
-            'Warning: dependency "%s" discontinuous. Not supported yet', path)
-        return []
-    elif path in old_set:
-      # remove dependency
-      # TODO(kcwu): handle removal properly (crbug.com/827092)
-      logger.warning('Warning: dependency "%s" was removed. Not supported yet',
-                     path)
-      return []
-    else:
-      assert path in new_set
-      # add dependency
-      # TODO(kcwu): handle addition properly (crbug.com/827092)
-      logger.warning('Warning: dependency "%s" was added. Not supported yet',
-                     path)
-      return []
-
-  result.sort(key=lambda diff: (diff.timestamp, diff.path))
-
-  return result