bisect-kit: make git_util.get_rev_by_time faster

On a machine with slow disk io, this change could reduce
bisect_cr_localbuild_internal.py init time from 41 to 8 minutes.

This also helps for init of chromeos local build bisections, but not
much (say, from 8 to 4 minutes).

BUG=b:157413258
TEST=./bisect_cr_localbuild_internal.py init --old 84.0.4142.0 --new 84.0.4147.0

Change-Id: Ide347534c1cfc8062939dda46c5594c4817f4606
diff --git a/bisect_kit/git_util.py b/bisect_kit/git_util.py
index b56da82..82a202b 100644
--- a/bisect_kit/git_util.py
+++ b/bisect_kit/git_util.py
@@ -5,6 +5,7 @@
 """Git utility."""
 
 from __future__ import print_function
+import bisect
 import logging
 import os
 import re
@@ -177,6 +178,150 @@
       raise
 
 
+def _adjust_timestamp_increasingly(commits):
+  """Adjust commit timestamps.
+
+  After adjust, the timestamps are increasing.
+
+  Args:
+    commits: list of (timestamp, commit hash)
+
+  Returns:
+    (adjusted count, list of (timestamp, commit hash))
+  """
+  result = []
+  adjusted = 0
+  last_timestamp = -1
+  for timestamp, git_rev in commits:
+    if timestamp < last_timestamp:
+      adjusted += 1
+      timestamp = last_timestamp
+    else:
+      last_timestamp = timestamp
+    result.append((timestamp, git_rev))
+  return adjusted, result
+
+
+class FastLookupFailed(Exception):
+  """No data is cached for this query.
+
+  The caller should fallback to the original operation.
+  """
+
+
+class FastLookupEntry:
+  """Cached commits from one branch of given time period.
+
+  With this class, we can look up commit via commit hash and timestamp fast.
+  """
+
+  def __init__(self, git_repo, branch):
+    self.git_repo = git_repo
+    self.branch = branch
+    self.optimized_period = None
+    self.cached = []
+    self.commit_to_index = {}
+
+  def optimize(self, period):
+    assert period[0] <= period[1]
+    if (self.optimized_period and self.optimized_period[0] <= period[0] and
+        period[1] <= self.optimized_period[1]):
+      # already done
+      return
+
+    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
+    self.optimized_period = period
+
+    # Adjust timestamps, so we can do binary search by timestamp
+    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)
+
+    self.commit_to_index = {}
+    for i, (_timestamp, rev) in enumerate(self.cached):
+      self.commit_to_index[rev] = i
+
+  def get_rev_by_time(self, timestamp):
+    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
+      raise FastLookupFailed
+
+    # Note that, the return value might be different as "git rev-list" if the
+    # actual commit timestamps are not fully increasing.
+    x = (timestamp, '')
+    idx = bisect.bisect_right(self.cached, x)
+    if idx == 0 and timestamp < self.cached[0][0]:
+      return None
+    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
+      idx -= 1
+    return self.cached[idx][1]
+
+  def is_containing_commit(self, rev):
+    if rev in self.commit_to_index:
+      return True
+    raise FastLookupFailed
+
+  def is_ancestor_commit(self, old, new):
+    old_idx = self.commit_to_index.get(old)
+    new_idx = self.commit_to_index.get(new)
+    if old_idx is not None and new_idx is not None:
+      return old_idx < new_idx
+    raise FastLookupFailed
+
+
+class FastLookup:
+  """Collection of FastLookupEntry"""
+
+  def __init__(self):
+    self.entries = {}
+    self.target_period = None
+
+  def optimize(self, period):
+    self.target_period = period
+
+  def disable(self):
+    self.target_period = None
+    self.entries = {}
+
+  def get_rev_by_time(self, git_repo, timestamp, branch):
+    if not self.target_period:
+      raise FastLookupFailed
+    if not self.target_period[0] <= timestamp <= self.target_period[1]:
+      raise FastLookupFailed
+
+    if git_repo not in self.entries:
+      self.entries[git_repo] = {}
+    if branch not in self.entries[git_repo]:
+      self.entries[git_repo][branch] = FastLookupEntry(git_repo, branch)
+    entry = self.entries[git_repo][branch]
+    entry.optimize(self.target_period)
+    return entry.get_rev_by_time(timestamp)
+
+  def is_containing_commit(self, git_repo, rev):
+    # This function is optimized only after get_rev_by_time() is invoked.
+    if git_repo not in self.entries:
+      raise FastLookupFailed
+
+    for entry in self.entries[git_repo].values():
+      try:
+        return entry.is_containing_commit(rev)
+      except FastLookupFailed:
+        pass
+    raise FastLookupFailed
+
+  def is_ancestor_commit(self, git_repo, old, new):
+    # This function is optimized only after get_rev_by_time() is invoked.
+    if git_repo not in self.entries:
+      raise FastLookupFailed
+
+    for entry in self.entries[git_repo].values():
+      try:
+        return entry.is_ancestor_commit(old, new)
+      except FastLookupFailed:
+        pass
+    raise FastLookupFailed
+
+
+fast_lookup = FastLookup()
+
+
 def is_containing_commit(git_repo, rev):
   """Determines given commit exists.
 
@@ -189,6 +334,11 @@
     returns False as well.
   """
   try:
+    return fast_lookup.is_containing_commit(git_repo, rev)
+  except FastLookupFailed:
+    pass
+
+  try:
     return util.check_output(
         'git', 'cat-file', '-t', rev, cwd=git_repo) == 'commit\n'
   except subprocess.CalledProcessError:
@@ -209,6 +359,11 @@
     True only if `old` is the ancestor of `new`. One commit is not considered
     as ancestor of itself.
   """
+  try:
+    return fast_lookup.is_ancestor_commit(git_repo, old, new)
+  except FastLookupFailed:
+    pass
+
   return util.check_output(
       'git',
       'rev-list',
@@ -425,6 +580,12 @@
   if not branch:
     branch = 'HEAD'
 
+  if not path:
+    try:
+      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
+    except FastLookupFailed:
+      pass
+
   cmd = [
       'git',
       'rev-list',
@@ -441,6 +602,38 @@
   return result or None
 
 
+def get_revlist_by_period(git_repo, branch, period):
+  # Find the last commit before period[0].
+  text = util.check_output(
+      'git',
+      'rev-list',
+      '--timestamp',
+      '-1',
+      '--before',
+      str(period[0] - 1),
+      branch,
+      cwd=git_repo)
+
+  # Find commits in the period.
+  text += util.check_output(
+      'git',
+      'rev-list',
+      '--timestamp',
+      '--reverse',
+      '--after',
+      str(period[0]),
+      '--before',
+      str(period[1]),
+      branch,
+      cwd=git_repo)
+
+  result = []
+  for line in text.splitlines():
+    timestamp, commit = line.split()
+    result.append((int(timestamp), commit))
+  return result
+
+
 def reset_hard(git_repo):
   """Restore modified and deleted files.
 
@@ -742,23 +935,16 @@
       '%s..%s' % (old, new),
       cwd=git_repo).splitlines():
     timestamp, git_rev = line.split()
-    commits.append([int(timestamp), git_rev])
+    commits.append((int(timestamp), git_rev))
 
   # bisect-kit has a fundamental assumption that commit timestamps are
   # increasing because we sort and bisect the commits by timestamp across git
   # repos. If not increasing, we have to adjust the timestamp as workaround.
   # This might lead to bad bisect result, however the bad probability is low in
   # practice since most machines' clocks are good enough.
-  if commits != sorted(commits, key=lambda x: x[0]):
+  adjusted, commits = _adjust_timestamp_increasingly(commits)
+  if adjusted != 0:
     logger.warning('Commit timestamps are not increasing')
-    last_timestamp = -1
-    adjusted = 0
-    for commit in commits:
-      if commit[0] < last_timestamp:
-        commit[0] = last_timestamp
-        adjusted += 1
-
-      last_timestamp = commit[0]
     logger.warning('%d timestamps adjusted', adjusted)
 
   return commits