Add README.patches, clone find_patches.py from ffmpeg repository.

Script allows for automatically generating the README.patches file
so we can track local modifications more easily.

BUG=766721

Change-Id: I4170a5b629ded08d6bafe1d0b131527588773059
diff --git a/find_patches.py b/find_patches.py
new file mode 100755
index 0000000..f579fcd
--- /dev/null
+++ b/find_patches.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+#
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+#
+"""Usage: find_patches.py [origin_branch] [> patch_file]
+
+This will find all changes in |origin_branch| that are not part of upstream,
+and print a report.  It tries to include deleted lines, though these are
+heuristic at best.  If |origin_branch| is omitted, it will default to HEAD.
+
+Changes in the working directory are ignored.
+
+Output will be written to stdout, so you probably want to redirect it.
+
+For example, to generate the patches file for origin/merge-m68:
+find_patches.py origin/merge-m68 > patches.68
+"""
+
+from __future__ import print_function
+import collections
+import os
+import re
+import sys
+import subprocess
+
+# What directory will we look for patches in?
+# TODO(liberato): Should we find the root of the ffmpeg tree?
+PATH = "."
+
+
+def log(str):
+  print("[%s]" % str, file=sys.stderr)
+
+
+def run(command):
+  """ Runs a command and returns stdout.
+
+  Args:
+    command: Array of argv[] entries. E.g., ["path_to_executable", "arg1", ...].
+
+  Returns:
+    stdout as a a string.
+  """
+  return subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0]
+
+
+class PatchInfo:
+  """ Structure to keep track of one patch in a diff.
+
+  This class encapsulates how to handle inserted / deleted lines in a patch,
+  mostly so that we can decide if we should apply "deleted lines only"
+  processing to any them, to find what commit deleted them.  Because deleted
+  lines result in an approximate search, we want to be reasonably sure that
+  any deleted lines aren't actually just changes ("delete old, add new").
+  """
+
+  def __init__(self):
+    # Does a diff insert any lines?
+    self._does_insert = False
+    # Set of lines that a diff deletes.
+    self._deleted_lines = set()
+
+  def record_inserted_line(self, line):
+    """ Records that |line| was inserted as part of the patch.
+
+    |line| is a string from the patch, e.g., "+ foo that was added;"
+    """
+    self._does_insert = True
+
+  def record_deleted_line(self, line):
+    """ Records that |line| was deleted as part of the patch.
+
+    |line| is a string from the patch, e.g., "- foo that was removed;"
+    """
+    self._deleted_lines.add(line)
+
+  def interesting_deleted_lines(self):
+    """ Return the (possibly empty) set of deleted lines that we should track.
+
+    In general, things that remove but also add probably are changes, and
+    can be ignored as noise.  While, with perfect deleted line tracking,
+    this wouldn't actually change the result, we really just do a text
+    search for deleted lines later.  So, avoiding noise is good.
+
+    Note that this is approximate -- a diff could have deleted and
+    inserted lines near each other, but from different patches.  In other
+    words, patch A could delete lines and patch B could add / change them.
+    If those changes end up in the same diff block, then we'll miss A
+    because of this test.  However, in practice, checking for both seems
+    to remove some noise.
+    """
+    if self._deleted_lines and not self._does_insert:
+      return self._deleted_lines
+    return set()
+
+
+def main(argv):
+  # Origin branch that contains the patches we want to find.
+  # Can specify, for example "origin/merge-m68" to get the  patches file for
+  # that revision, regardless of the state of the working tree.
+  if len(argv) > 1:
+    origin_branch = argv[1]
+  else:
+    origin_branch = "HEAD"
+
+  # Make sure that upstream is up-to-date, else many things will likely not
+  # be reachable from it.  We don't do this if run as part of a script.
+  if subprocess.call(["git", "fetch", "upstream"]):
+    raise Exception("Could not fetch from upstream")
+
+  write_patches_file(origin_branch, sys.stdout)
+
+
+def write_patches_file(origin_branch, output_file):
+  """Write the patches file for |origin_branch| to |output_file|."""
+  # Get the latest upstream commit that's reachable from the origin branch.
+  # We'll use that to compare against.
+  upstream = run(["git", "merge-base", "upstream/master",
+                  origin_branch]).strip()
+  if not upstream:
+    raise Exception("Could not find upstream commit")
+
+  # "Everything reachable from |origin_branch| but not |upstream|".  In other
+  # words, all and only chromium changes.  Note that there are non-chromium
+  # authors here, since it will include cherry-picks to origin.
+  revision_range = "%s..%s" % (upstream, origin_branch)
+
+  log("Origin is %s" % origin_branch)
+  log("Upstream is %s" % upstream)
+
+  # Find diffs between the versions, excluding all files that are only on
+  # origin.  We explicitly exclude .gitignore, since it exists in both places.
+  # Ask for no context, since we ignore it anyway.
+  diff = run([
+      "git", "diff", "--diff-filter=a", "-U0", revision_range, PATH,
+      ":!.gitignore"
+  ])
+
+  # Set of chromium patch sha1s we've seen.
+  sha1s = set()
+  # Map of sha1 to set of files that it affects.
+  sha1ToFiles = collections.defaultdict(set)
+  # Mapping of filename to set of lines that were deleted.
+  files_to_deleted_lines = {}
+  patch_info = PatchInfo()
+  filename = None
+
+  # Process each diff.  Include a dummy line to flush out the last diff.
+  log("Scanning diffs between origin and upstream")
+  for line in diff.splitlines() + ["+++ just to handle deleted lines properly"]:
+    if line.startswith("+++"):
+      # If the previous patch was delete-only, then we need to search for it
+      # differently, since we don't get blame entries for deleted lines.
+      # Add the set of deleted lines to this filename.
+      deleted_lines = patch_info.interesting_deleted_lines()
+      if deleted_lines:
+        files_to_deleted_lines[filename] = deleted_lines
+
+      # Update to the new filename.
+      filename = line[6:]
+      log("Checking diffs in %s" % filename)
+
+      # Start of a new diff.  We don't know if it inserts / deletes lines.
+      patch_info = PatchInfo()
+    elif line.startswith("@@"):
+      # @@ -linespec +linespec @@
+      # linespec is either "line_number,number_of_lines" or "line_number".
+      # Extract the "+linespec", which is what was added by |origin|.
+      # If the number of lines is specified as 0, then it's a deletion only.
+      # If the number of lines is unspecified, then it's 1.
+      added_linespec = re.sub(r"^.*\+(.*) @@.*", r"\1", line)
+      # Figure out the lines to blame.  This is just "starting_line,+number".
+      if "," in added_linespec:
+        # linespec is "line_number,number_of_lines"
+        added_parts = added_linespec.split(",")
+        # Skip if this is a deletion.
+        if added_parts[1] == "0":
+          continue
+        blame_range = "%s,+%s" % (added_parts[0], added_parts[1])
+      else:
+        # One-line change
+        blame_range = "%s,+1" % added_linespec
+
+      blame = run([
+          "git", "blame", "-l",
+          "-L %s" % blame_range, revision_range, "--", filename
+      ])
+
+      # Collect sha1 lines, and create a mapping of files that is changed by
+      # each sha1.
+      for blame_line in blame.splitlines():
+        sha1 = blame_line.split(" ", 1)[0]
+        if sha1:
+          sha1s.add(sha1)
+          sha1ToFiles[sha1].add(filename)
+    elif line.startswith("---"):
+      # Do nothing.  Just avoid matching "---" when we check for "-"
+      pass
+    elif line.startswith("-"):
+      # This diff does delete lines.
+      patch_info.record_deleted_line(line[1:])
+    elif line.startswith("+"):
+      # This diff does insert lines.
+      patch_info.record_inserted_line(line[1:])
+
+  # For all files that have deleted lines, look for the sha1 that deleted them.
+  # This is heuristic only; we're looking for "commits that contain some text".
+  for filename, deleted_lines in files_to_deleted_lines.items():
+    for deleted_line in deleted_lines:
+      # Make sure that the deleted line is long enough to provide context.
+      if len(deleted_line) < 4:
+        continue
+
+      log("Checking for deleted lines in %s" % filename)
+      # Specify "--first-parent" so that we find commits on (presumably) origin.
+      sha1 = run([
+          "git", "log", "-1", revision_range, "--format=%H", "-S", deleted_line,
+          origin_branch, "--", filename
+      ]).strip()
+
+      # Add the sha1 to the sets
+      sha1s.add(sha1)
+      sha1ToFiles[sha1].add(filename)
+
+  # Look up dates from sha1 hashes.  We want to output them in a canonical order
+  # so that we can diff easier.  Date order seems more convenient that sha1.
+  log("Looking up sha1 dates to sort them")
+  sha1_to_date = {}
+  for sha1 in sha1s:
+    date = run(["git", "log", "-1", "--format=%at", "%s" % sha1]).strip()
+    sha1_to_date[sha1] = date
+
+  # Print the patches file.
+  log("Writing patch file")
+  print(
+      "---------------------------------------------------------------------",
+      file=output_file)
+  print(
+      "-- Chromium Patches. Autogenerated by " + os.path.basename(__file__) +
+      ", do not edit --",
+      file=output_file)
+  print(
+      "---------------------------------------------------------------------",
+      file=output_file)
+  print("\n", file=output_file)
+  wd = os.getcwd()
+  for sha1, date in sorted(sha1_to_date.iteritems(), key=lambda (k, v): v):
+    print(
+        "------------------------------------------------------------------",
+        file=output_file)
+    for line in run(["git", "log", "-1", "%s" % sha1]).splitlines():
+      print(line.rstrip(), file=output_file)
+    print("\nAffects:", file=output_file)
+    # TODO(liberato): maybe add the lines that were affected.
+    for file in sorted(sha1ToFiles[sha1]):
+      print("    " + os.path.relpath(file, wd), file=output_file)
+    print(file=output_file)
+
+  log("Done")
+
+
+if __name__ == "__main__":
+  main(sys.argv)