#!/usr/bin/env python
#
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
"""Usage: find_patches.py [origin_branch] [> patch_file]

This will find all changes in |origin_branch| that are not part of upstream,
and print a report. It tries to include deleted lines, though these are
heuristic at best. If |origin_branch| is omitted, it will default to HEAD.

Changes in the working directory are ignored.

Output will be written to stdout, so you probably want to redirect it.

For example, to generate the patches file for origin/merge-m68:
find_patches.py origin/merge-m68 > patches.68
"""
20
21from __future__ import print_function
22import collections
23import os
24import re
25import sys
26import subprocess
27
# Pathspec handed to `git diff` to select where we look for patches.
# TODO(liberato): Should we find the root of the ffmpeg tree?
PATH = "."
31
32
def log(str):
    """Emit |str| on stderr, wrapped in square brackets.

    Progress messages go to stderr so that they never mix with the report,
    which is written to stdout.
    """
    message = "[%s]" % str
    print(message, file=sys.stderr)
35
36
def run(command):
    """Runs a command and returns its stdout.

    The exit status of the command is not checked, and stderr is not
    captured (it is inherited from this process).

    Args:
      command: Array of argv[] entries. E.g., ["path_to_executable", "arg1", ...].

    Returns:
      stdout as a native string (`str`).
    """
    output = subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0]
    # On Python 3 a pipe yields bytes, while callers split the result and
    # compare it against str literals. Normalize to str here; on Python 2
    # the output already is a str and is returned untouched. Undecodable
    # bytes are replaced rather than aborting the whole report.
    if not isinstance(output, str):
        output = output.decode("utf-8", "replace")
    return output
47
48
class PatchInfo:
    """ Bookkeeping for one patch in a diff.

    Tracks which lines the patch deletes and whether it inserts anything at
    all. Deleted lines can only be located later with an approximate text
    search, so we only want to chase them when we are reasonably sure they
    are real deletions rather than one half of a "delete old, add new"
    change.
    """

    def __init__(self):
        # Every distinct line that the diff removes.
        self._deleted_lines = set()
        # Becomes True as soon as the diff adds at least one line.
        self._does_insert = False

    def record_inserted_line(self, line):
        """ Notes that the patch inserted |line|.

        |line| is a string from the patch, e.g., "+ foo that was added;".
        Only the fact that something was inserted is remembered; the text
        itself is not stored.
        """
        self._does_insert = True

    def record_deleted_line(self, line):
        """ Notes that the patch deleted |line|.

        |line| is a string from the patch, e.g., "- foo that was removed;".
        """
        self._deleted_lines.add(line)

    def interesting_deleted_lines(self):
        """ Return the (possibly empty) set of deleted lines worth tracking.

        A patch that both removes and adds lines is most likely a change,
        and its deletions are treated as noise: they are looked up with a
        plain text search later, so fewer false candidates is better.

        This is approximate. Deletions from one patch and insertions from
        another can land in the same diff, in which case the deleting patch
        is missed. In practice, requiring "deletes but never inserts" seems
        to remove some noise.
        """
        if self._does_insert or not self._deleted_lines:
            return set()
        return self._deleted_lines
97
98
def main(argv):
    """Script entry point: refresh upstream, then print the patches report.

    Args:
      argv: Command line, including the program name. argv[1], if present,
        names the origin branch that contains the patches we want to find
        (for example "origin/merge-m68", to get the patches file for that
        revision regardless of the state of the working tree). It defaults
        to "HEAD".

    Raises:
      Exception: if `git fetch upstream` exits with a non-zero status.
    """
    origin_branch = argv[1] if len(argv) > 1 else "HEAD"

    # Upstream has to be current, else many things will likely not be
    # reachable from it. Only this entry point fetches; scripts that call
    # write_patches_file() directly do not.
    fetch_status = subprocess.call(["git", "fetch", "upstream"])
    if fetch_status:
        raise Exception("Could not fetch from upstream")

    write_patches_file(origin_branch, sys.stdout)
114
115
def _added_blame_range(hunk_header):
    """Return the `git blame -L` range for the lines that a diff hunk adds.

    |hunk_header| is a "@@ -linespec +linespec @@" line, where linespec is
    either "line_number,number_of_lines" or just "line_number" (in which
    case the number of lines is 1). Only the "+linespec" is used, since that
    is what was added by origin.

    Returns "starting_line,+number", or None if the number of lines is 0,
    i.e. the hunk is a deletion only.
    """
    added_linespec = re.sub(r"^.*\+(.*) @@.*", r"\1", hunk_header)
    if "," not in added_linespec:
        # One-line change.
        return "%s,+1" % added_linespec
    added_parts = added_linespec.split(",")
    if added_parts[1] == "0":
        return None
    return "%s,+%s" % (added_parts[0], added_parts[1])


def write_patches_file(origin_branch, output_file):
    """Write the patches file for |origin_branch| to |output_file|.

    Diffs |origin_branch| against its merge-base with upstream/master, uses
    `git blame` to attribute added lines to commits and a `git log -S` text
    search to guess which commits removed lines, then prints one entry per
    commit, ordered by author date.

    Args:
      origin_branch: Revision that contains the patches, e.g. "HEAD".
      output_file: File-like object that the report is printed to.

    Raises:
      Exception: if no merge-base with upstream/master could be found.
    """
    # Get the latest upstream commit that's reachable from the origin branch.
    # We'll use that to compare against.
    upstream = run(["git", "merge-base", "upstream/master",
                    origin_branch]).strip()
    if not upstream:
        raise Exception("Could not find upstream commit")

    # "Everything reachable from |origin_branch| but not |upstream|". In
    # other words, all and only chromium changes. Note that there are
    # non-chromium authors here, since it will include cherry-picks to origin.
    revision_range = "%s..%s" % (upstream, origin_branch)

    log("Origin is %s" % origin_branch)
    log("Upstream is %s" % upstream)

    # Find diffs between the versions, excluding all files that are only on
    # origin. We explicitly exclude .gitignore, since it exists in both
    # places. Ask for no context, since we ignore it anyway.
    diff = run([
        "git", "diff", "--diff-filter=a", "-U0", revision_range, PATH,
        ":!.gitignore"
    ])

    # Set of chromium patch sha1s we've seen.
    sha1s = set()
    # Map of sha1 to set of files that it affects.
    sha1_to_files = collections.defaultdict(set)
    # Mapping of filename to set of lines that were deleted.
    files_to_deleted_lines = {}
    patch_info = PatchInfo()
    filename = None

    # Process each diff. Include a dummy line to flush out the last diff.
    log("Scanning diffs between origin and upstream")
    flush_marker = "+++ just to handle deleted lines properly"
    for line in diff.splitlines() + [flush_marker]:
        if line.startswith("+++"):
            # If the previous patch was delete-only, then we need to search
            # for it differently, since we don't get blame entries for
            # deleted lines. Add the set of deleted lines to this filename.
            deleted_lines = patch_info.interesting_deleted_lines()
            if deleted_lines:
                files_to_deleted_lines[filename] = deleted_lines

            # Update to the new filename. The slice drops the first six
            # characters of the header, i.e. it assumes a "+++ b/" prefix.
            filename = line[6:]
            log("Checking diffs in %s" % filename)

            # Start of a new diff. We don't know if it inserts / deletes
            # lines.
            patch_info = PatchInfo()
        elif line.startswith("@@"):
            blame_range = _added_blame_range(line)
            if blame_range is None:
                # Deletion-only hunk: there is nothing to blame.
                continue

            blame = run([
                "git", "blame", "-l",
                "-L %s" % blame_range, revision_range, "--", filename
            ])

            # Collect sha1 lines, and create a mapping of files that is
            # changed by each sha1.
            for blame_line in blame.splitlines():
                sha1 = blame_line.split(" ", 1)[0]
                if sha1:
                    sha1s.add(sha1)
                    sha1_to_files[sha1].add(filename)
        elif line.startswith("---"):
            # Do nothing. Just avoid matching "---" when we check for "-".
            pass
        elif line.startswith("-"):
            # This diff does delete lines.
            patch_info.record_deleted_line(line[1:])
        elif line.startswith("+"):
            # This diff does insert lines.
            patch_info.record_inserted_line(line[1:])

    # For all files that have deleted lines, look for the sha1 that deleted
    # them. This is heuristic only; we're looking for "commits that contain
    # some text".
    for filename, deleted_lines in files_to_deleted_lines.items():
        for deleted_line in deleted_lines:
            # Make sure that the deleted line is long enough to provide
            # context.
            if len(deleted_line) < 4:
                continue

            log("Checking for deleted lines in %s" % filename)
            # Pickaxe search (-S) for a commit in the range that touches the
            # deleted text. Note that "--first-parent" is not passed.
            sha1 = run([
                "git", "log", "-1", revision_range, "--format=%H", "-S",
                deleted_line, origin_branch, "--", filename
            ]).strip()

            # The search can come up empty. Don't record "" as a sha1, or we
            # would print a bogus, empty entry in the report.
            if not sha1:
                continue

            # Add the sha1 to the sets.
            sha1s.add(sha1)
            sha1_to_files[sha1].add(filename)

    # Look up dates from sha1 hashes. We want to output them in a canonical
    # order so that we can diff easier. Date order seems more convenient
    # than sha1.
    log("Looking up sha1 dates to sort them")
    sha1_to_date = {}
    for sha1 in sha1s:
        date = run(["git", "log", "-1", "--format=%at", "%s" % sha1]).strip()
        sha1_to_date[sha1] = date

    def date_then_sha1(item):
        # Order numerically by timestamp. Break ties on the sha1 so that the
        # order never depends on set / dict iteration order. A date that
        # could not be looked up sorts first.
        sha1, date = item
        return (int(date) if date.isdigit() else 0, sha1)

    # Print the patches file.
    log("Writing patch file")
    banner_rule = (
        "---------------------------------------------------------------------")
    entry_rule = (
        "------------------------------------------------------------------")
    print(banner_rule, file=output_file)
    print(
        "-- Chromium Patches. Autogenerated by " +
        os.path.basename(__file__) + ", do not edit --",
        file=output_file)
    print(banner_rule, file=output_file)
    print("\n", file=output_file)
    wd = os.getcwd()
    for sha1, _ in sorted(sha1_to_date.items(), key=date_then_sha1):
        print(entry_rule, file=output_file)
        for line in run(["git", "log", "-1", "%s" % sha1]).splitlines():
            print(line.rstrip(), file=output_file)
        print("\nAffects:", file=output_file)
        # TODO(liberato): maybe add the lines that were affected.
        for path in sorted(sha1_to_files[sha1]):
            # Report paths relative to the current directory, always with
            # forward slashes.
            relfile = os.path.relpath(path, wd).replace('\\', '/')
            print(" " + relfile, file=output_file)
        print(file=output_file)

    log("Done")
263
264
# Only run when executed as a script; hand the full command line to main().
if __name__ == "__main__":
    main(sys.argv)