#!/usr/bin/env python3
#
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
"""Usage: find_patches.py [origin_branch] [> patch_file]

This will find all changes in |origin_branch| that are not part of upstream,
and print a report.  It tries to include deleted lines, though these are
heuristic at best.  If |origin_branch| is omitted, it will default to HEAD.

Changes in the working directory are ignored.

Output will be written to stdout, so you probably want to redirect it.

For example, to generate the patches file for origin/merge-m68:
find_patches.py origin/merge-m68 > patches.68
"""
| 20 | |
Dale Curtis | 417709d | 2018-11-19 15:46:52 -0800 | [diff] [blame] | 21 | import collections |
| 22 | import os |
| 23 | import re |
| 24 | import sys |
| 25 | import subprocess |
| 26 | |
# Directory in which to look for patches; it is handed to `git diff` as the
# path that limits the comparison.
# TODO(liberato): Should we find the root of the ffmpeg tree?
PATH = "."
| 30 | |
| 31 | |
def log(str):
    """Print |str| to stderr, wrapped in square brackets.

    Args:
      str: The message to print.  Any object is accepted; it is formatted
          with "%s".  (The parameter name shadows the builtin, but is kept
          so that existing callers are unaffected.)
    """
    # Wrap the argument in a one-element tuple so that a tuple argument is
    # formatted as a single value rather than being unpacked as the format
    # arguments, which would raise TypeError.
    print("[%s]" % (str,), file=sys.stderr)
| 34 | |
| 35 | |
def run(command):
    """Execute |command| and hand back whatever it wrote to stdout.

    Args:
      command: List of argv[] entries, e.g. ["path_to_executable", "arg1", ...].

    Returns:
      The command's stdout, decoded into a string.  The exit status is not
      inspected, and stderr is not captured.
    """
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    return completed.stdout.decode()
Dale Curtis | 417709d | 2018-11-19 15:46:52 -0800 | [diff] [blame] | 46 | |
| 47 | |
class PatchInfo:
    """Bookkeeping for the inserted / deleted lines of one patch in a diff.

    Deleted lines can only be traced back to a commit with an approximate
    text search, so we want to be reasonably sure that the lines really were
    removed, rather than changed ("delete old, add new").  This class records
    what a patch does so that the caller can decide whether the "deleted
    lines only" processing should be applied to it.
    """

    def __init__(self):
        # Every line that the diff removes.
        self._deleted_lines = set()
        # Becomes True as soon as the diff is seen to add a line.
        self._does_insert = False

    def record_inserted_line(self, line):
        """Note that the patch inserted |line|.

        |line| is a string from the patch, e.g., "+ foo that was added;".
        Only the fact that something was inserted is remembered; the text
        itself is not stored.
        """
        self._does_insert = True

    def record_deleted_line(self, line):
        """Note that the patch deleted |line|.

        |line| is a string from the patch, e.g., "- foo that was removed;".
        """
        self._deleted_lines.add(line)

    def interesting_deleted_lines(self):
        """Return the (possibly empty) set of deleted lines worth tracking.

        A patch that both removes and adds lines is most likely a change, and
        is ignored as noise: the deleted lines are later located with a plain
        text search, so reducing noise matters.

        This is approximate.  A diff block may contain lines deleted by one
        patch next to lines added by a different patch, in which case the
        deletion is missed.  In practice, checking for both seems to remove
        some noise.

        Returns:
          The recorded deleted lines if the patch inserted nothing, otherwise
          an empty set.
        """
        if self._does_insert or not self._deleted_lines:
            return set()
        return self._deleted_lines
| 96 | |
| 97 | |
def main(argv):
    """Fetch upstream, then write the patches report to stdout.

    Args:
      argv: Command-line arguments.  argv[1], when present, names the origin
          branch that contains the patches we want to find, for example
          "origin/merge-m68"; the report is for that revision regardless of
          the state of the working tree.  Defaults to "HEAD".

    Raises:
      Exception: if `git fetch upstream` exits with a non-zero status.
    """
    origin_branch = argv[1] if len(argv) > 1 else "HEAD"

    # Upstream has to be up-to-date, else many things will likely not be
    # reachable from it.  The fetch lives here, not in write_patches_file(),
    # so it is skipped when that function is called from another script.
    fetch_status = subprocess.call(["git", "fetch", "upstream"])
    if fetch_status != 0:
        raise Exception("Could not fetch from upstream")

    write_patches_file(origin_branch, sys.stdout)
| 113 | |
| 114 | |
def _added_blame_range(hunk_header):
    """Return the `git blame -L` range for the lines a hunk adds, or None.

    |hunk_header| is a "@@ -linespec +linespec @@" line, where linespec is
    either "line_number,number_of_lines" or just "line_number" (in which
    case the number of lines is 1).  Only the "+linespec", i.e. what origin
    added, is used.

    Returns:
      "starting_line,+number", or None if the hunk adds zero lines (it is a
      deletion only).
    """
    added_linespec = re.sub(r"^.*\+(.*) @@.*", r"\1", hunk_header)
    if "," not in added_linespec:
        # One-line change.
        return "%s,+1" % added_linespec
    added_parts = added_linespec.split(",")
    if added_parts[1] == "0":
        return None
    return "%s,+%s" % (added_parts[0], added_parts[1])


def _report_sort_key(sha1_and_date):
    """Return the sort key for one (sha1, date) pair of the report.

    |date| is the output of `git log --format=%at`, a decimal unix timestamp,
    or an empty string if git printed nothing.  Comparing it as a number,
    with the sha1 as a tie-breaker, gives an order that does not depend on
    set / dict iteration order, so that two runs produce the same file.
    """
    sha1, date = sha1_and_date
    timestamp = int(date) if date.isdigit() else 0
    return (timestamp, sha1)


def write_patches_file(origin_branch, output_file):
    """Write the patches file for |origin_branch| to |output_file|.

    The report lists every commit that is responsible for a difference
    between |origin_branch| and the latest upstream commit reachable from it,
    together with the files each commit affects.  Commits are found with
    `git blame` for added lines, and with a heuristic text search
    (`git log -S`) for files whose diff only deletes lines.

    Args:
      origin_branch: Revision to report on, e.g. "origin/merge-m68" or "HEAD".
      output_file: Writable text file object that receives the report.

    Raises:
      Exception: if no merge base with upstream/master could be found.
    """
    # Get the latest upstream commit that's reachable from the origin branch.
    # We'll use that to compare against.
    upstream = run(["git", "merge-base", "upstream/master",
                    origin_branch]).strip()
    if not upstream:
        raise Exception("Could not find upstream commit")

    # "Everything reachable from |origin_branch| but not |upstream|".  In
    # other words, all and only chromium changes.  Note that there are
    # non-chromium authors here, since it will include cherry-picks to origin.
    revision_range = "%s..%s" % (upstream, origin_branch)

    log("Origin is %s" % origin_branch)
    log("Upstream is %s" % upstream)

    # Find diffs between the versions, excluding all files that are only on
    # origin.  We explicitly exclude .gitignore, since it exists in both
    # places.  Ask for no context, since we ignore it anyway.
    diff = run([
        "git", "diff", "--diff-filter=a", "-U0", revision_range, PATH,
        ":!.gitignore"
    ])

    # Set of chromium patch sha1s we've seen.
    sha1s = set()
    # Map of sha1 to set of files that it affects.
    sha1ToFiles = collections.defaultdict(set)
    # Mapping of filename to set of lines that were deleted.
    files_to_deleted_lines = {}
    patch_info = PatchInfo()
    filename = None

    # Process each diff.  Include a dummy line to flush out the last diff.
    log("Scanning diffs between origin and upstream")
    for line in diff.splitlines() + ["+++ just to handle deleted lines properly"]:
        if line.startswith("+++"):
            # If the previous patch was delete-only, then we need to search
            # for it differently, since we don't get blame entries for deleted
            # lines.  Add the set of deleted lines to this filename.
            deleted_lines = patch_info.interesting_deleted_lines()
            if deleted_lines:
                files_to_deleted_lines[filename] = deleted_lines

            # Update to the new filename.  The first six characters are
            # dropped; presumably they are the "+++ b/" prefix that git puts
            # in front of the path.
            filename = line[6:]
            log("Checking diffs in %s" % filename)

            # Start of a new diff.  We don't know if it inserts / deletes
            # lines.
            patch_info = PatchInfo()
        elif line.startswith("@@"):
            # Figure out the lines to blame; skip if this is a deletion.
            blame_range = _added_blame_range(line)
            if blame_range is None:
                continue

            blame = run([
                "git", "blame", "-l",
                "-L %s" % blame_range, revision_range, "--", filename
            ])

            # Collect sha1 lines, and create a mapping of files that is
            # changed by each sha1.
            for blame_line in blame.splitlines():
                sha1 = blame_line.split(" ", 1)[0]
                if sha1:
                    sha1s.add(sha1)
                    sha1ToFiles[sha1].add(filename)
        elif line.startswith("---"):
            # Do nothing.  Just avoid matching "---" when we check for "-".
            pass
        elif line.startswith("-"):
            # This diff does delete lines.
            patch_info.record_deleted_line(line[1:])
        elif line.startswith("+"):
            # This diff does insert lines.
            patch_info.record_inserted_line(line[1:])

    # For all files that have deleted lines, look for the sha1 that deleted
    # them.  This is heuristic only; we're looking for "commits that contain
    # some text".
    for filename, deleted_lines in files_to_deleted_lines.items():
        for deleted_line in deleted_lines:
            # Make sure that the deleted line is long enough to provide
            # context.
            if len(deleted_line) < 4:
                continue

            log("Checking for deleted lines in %s" % filename)
            # Ask for the single most recent commit ("-1") touching |filename|
            # that "-S" matches for the deleted text.
            sha1 = run([
                "git", "log", "-1", revision_range, "--format=%H", "-S",
                deleted_line, origin_branch, "--", filename
            ]).strip()

            # The search may find nothing.  Recording an empty sha1 would add
            # a bogus, empty entry to the report and make the git lookups
            # below run on an empty revision, so skip it, just like empty
            # sha1s from blame are skipped above.
            if not sha1:
                log("No commit found for a deleted line in %s" % filename)
                continue

            # Add the sha1 to the sets.
            sha1s.add(sha1)
            sha1ToFiles[sha1].add(filename)

    # Look up dates from sha1 hashes.  We want to output them in a canonical
    # order so that we can diff easier.  Date order seems more convenient
    # than sha1.
    log("Looking up sha1 dates to sort them")
    sha1_to_date = {}
    for sha1 in sha1s:
        date = run(["git", "log", "-1", "--format=%at", "%s" % sha1]).strip()
        sha1_to_date[sha1] = date

    # Print the patches file.
    log("Writing patch file")
    print(
        "---------------------------------------------------------------------",
        file=output_file)
    print(
        "-- Chromium Patches. Autogenerated by " + os.path.basename(__file__) +
        ", do not edit --",
        file=output_file)
    print(
        "---------------------------------------------------------------------",
        file=output_file)
    print("\n", file=output_file)
    wd = os.getcwd()
    for sha1, date in sorted(sha1_to_date.items(), key=_report_sort_key):
        print(
            "------------------------------------------------------------------",
            file=output_file)
        for log_line in run(["git", "log", "-1", "%s" % sha1]).splitlines():
            print(log_line.rstrip(), file=output_file)
        print("\nAffects:", file=output_file)
        # TODO(liberato): maybe add the lines that were affected.
        for affected_file in sorted(sha1ToFiles[sha1]):
            # Always print forward slashes, whatever the platform separator.
            relfile = os.path.relpath(affected_file, wd).replace('\\', '/')
            print("    " + relfile, file=output_file)
        print(file=output_file)

    log("Done")
| 262 | |
| 263 | |
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)