git hyper-blame: Added approx. line number translation.

Previously, when a commit was skipped, it would be blamed on the line
number the line had *after* the skipped commit. This could mean a
totally unrelated commit gets blamed. Now, a heuristic analyses the diff
of the skipped commit to discover approximately what line number the
line had *before* the skipped commit, so it can hopefully be blamed on
the right commit.

BUG=574290

Review URL: https://codereview.chromium.org/1629253002

git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@298609 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/git_hyper_blame.py b/git_hyper_blame.py
index 1742451..5a7daa0 100755
--- a/git_hyper_blame.py
+++ b/git_hyper_blame.py
@@ -149,6 +149,110 @@
   return list(parse_blame(blame))
 
 
+# Map from (oldrev, newrev) to hunk list (caching the results of git diff, but
+# only the hunk line numbers, not the actual diff contents).
+# hunk list contains (old, new) pairs, where old and new are (start, length)
+# pairs. A hunk list can also be None (if the diff failed).
+diff_hunks_cache = {}
+
+
+def cache_diff_hunks(oldrev, newrev):
+  def parse_start_length(s):
+    # Chop the '-' or '+'.
+    s = s[1:]
+    # Length is optional (defaults to 1).
+    try:
+      start, length = s.split(',')
+    except ValueError:
+      start = s
+      length = 1
+    return int(start), int(length)
+
+  try:
+    return diff_hunks_cache[(oldrev, newrev)]
+  except KeyError:
+    pass
+
+  # Use -U0 to get the smallest possible hunks.
+  diff = git_common.diff(oldrev, newrev, '-U0')
+
+  # Get all the hunks.
+  hunks = []
+  for line in diff.split('\n'):
+    if not line.startswith('@@'):
+      continue
+    ranges = line.split(' ', 3)[1:3]
+    ranges = tuple(parse_start_length(r) for r in ranges)
+    hunks.append(ranges)
+
+  diff_hunks_cache[(oldrev, newrev)] = hunks
+  return hunks
+
+
+def approx_lineno_across_revs(filename, newfilename, revision, newrevision,
+                              lineno):
+  """Computes the approximate movement of a line number between two revisions.
+
+  Consider line |lineno| in |filename| at |revision|. This function computes the
+  line number of that line in |newfilename| at |newrevision|. This is
+  necessarily approximate.
+
+  Args:
+    filename: The file (within the repo) at |revision|.
+    newfilename: The name of the same file at |newrevision|.
+    revision: A git revision.
+    newrevision: Another git revision. Note: Can be ahead or behind |revision|.
+    lineno: Line number within |filename| at |revision|.
+
+  Returns:
+    Line number within |newfilename| at |newrevision|.
+  """
+  # This doesn't work that well if there are a lot of line changes within the
+  # hunk (demonstrated by GitHyperBlameLineMotionTest.testIntraHunkLineMotion).
+  # A fuzzy heuristic that takes the text of the new line and tries to find a
+  # deleted line within the hunk that mostly matches the new line could help.
+
+  # Use the <revision>:<filename> syntax to diff between two blobs. This is the
+  # only way to diff a file that has been renamed.
+  old = '%s:%s' % (revision, filename)
+  new = '%s:%s' % (newrevision, newfilename)
+  hunks = cache_diff_hunks(old, new)
+
+  cumulative_offset = 0
+
+  # Find the hunk containing lineno (if any).
+  for (oldstart, oldlength), (newstart, newlength) in hunks:
+    cumulative_offset += newlength - oldlength
+
+    if lineno >= oldstart + oldlength:
+      # Not there yet.
+      continue
+
+    if lineno < oldstart:
+      # Gone too far.
+      break
+
+    # lineno is in [oldstart, oldlength] at revision; [newstart, newlength] at
+    # newrevision.
+
+    # If newlength == 0, newstart will be the line before the deleted hunk.
+    # Since the line must have been deleted, just return that as the nearest
+    # line in the new file. Caution: newstart can be 0 in this case.
+    if newlength == 0:
+      return max(1, newstart)
+
+    newend = newstart + newlength - 1
+
+    # Move lineno based on the amount the entire hunk shifted.
+    lineno = lineno + newstart - oldstart
+    # Constrain the output within the range [newstart, newend].
+    return min(newend, max(newstart, lineno))
+
+  # Wasn't in a hunk. Figure out the line motion based on the difference in
+  # length between the hunks seen so far.
+  return lineno + cumulative_offset
+
+
 def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
                 err=sys.stderr):
   # Map from commit to parsed blame from that commit.
@@ -189,23 +293,19 @@
         # ignore this commit.
         break
 
-      # line.lineno_then is the line number in question at line.commit.
-      # TODO(mgiuca): This will be incorrect if line.commit added or removed
-      # lines. Translate that line number so that it refers to the position of
-      # the same line on previouscommit.
-      lineno_previous = line.lineno_then
+      # line.lineno_then is the line number in question at line.commit. We need
+      # to translate that line number so that it refers to the position of the
+      # same line on previouscommit.
+      lineno_previous = approx_lineno_across_revs(
+          line.commit.filename, previousfilename, line.commit.commithash,
+          previouscommit, line.lineno_then)
       logging.debug('ignore commit %s on line p%d/t%d/n%d',
                     line.commit.commithash, lineno_previous, line.lineno_then,
                     line.lineno_now)
 
       # Get the line at lineno_previous in the parent commit.
-      assert lineno_previous > 0
-      try:
-        newline = parent_blame[lineno_previous - 1]
-      except IndexError:
-        # lineno_previous is a guess, so it may be past the end of the file.
-        # Just grab the last line in the file.
-        newline = parent_blame[-1]
+      assert 1 <= lineno_previous <= len(parent_blame)
+      newline = parent_blame[lineno_previous - 1]
 
       # Replace the commit and lineno_then, but not the lineno_now or context.
       logging.debug('    replacing with %r', newline)