blob: 06464d74f65d35c97ecf044bb4552537565eea27 [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
8import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +08009import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080010import re
11import subprocess
12
13from bisect_kit import cli
14from bisect_kit import util
15
16logger = logging.getLogger(__name__)
17
18GIT_FULL_COMMIT_ID_LENGTH = 40
19
20# Minimal acceptable length of git commit id.
21#
22# For chromium, hash collision rate over number of digits:
23# - 6 digits: 4.85%
24# - 7 digits: 0.32%
25# - 8 digits: 0.01%
26# As foolproof check, 7 digits should be enough.
27GIT_MIN_COMMIT_ID_LENGTH = 7
28
29
30def is_git_rev(s):
31 """Is a git hash-like version string.
32
33 It accepts shortened hash with at least 7 digits.
34 """
35 if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
36 return False
37 return bool(re.match(r'^[0-9a-f]+$', s))
38
39
40def argtype_git_rev(s):
41 """Validates git hash."""
42 if not is_git_rev(s):
43 msg = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
44 raise cli.ArgTypeError(msg, '1a2b3c4d5e')
45 return s
46
47
Kuang-che Wu3eb6b502018-06-06 16:15:18 +080048def is_git_root(path):
49 """Is given path root of git repo."""
50 return os.path.exists(os.path.join(path, '.git'))
51
52
Kuang-che Wu6948ecc2018-09-11 17:43:49 +080053def clone(git_repo, repo_url, reference=None):
54 if not os.path.exists(git_repo):
55 os.makedirs(git_repo)
56 cmd = ['git', 'clone', repo_url, '.']
57 if reference:
58 cmd += ['--reference', reference]
59 util.check_call(*cmd, cwd=git_repo)
60
61
Kuang-che Wue41e0062017-09-01 19:04:14 +080062def checkout_version(git_repo, rev):
63 """git checkout.
64
65 Args:
66 git_repo: path of git repo.
67 rev: git commit revision to checkout.
68 """
69 util.check_call('git', 'checkout', '-q', '-f', rev, cwd=git_repo)
70
71
72def is_containing_commit(git_repo, rev):
73 """Determines given commit exists.
74
75 Args:
76 git_repo: path of git repo.
77 rev: git commit revision in query.
Kuang-che Wu3eb6b502018-06-06 16:15:18 +080078
79 Returns:
80 True if rev is inside given git repo. If git_repo is not a git folder,
81 returns False as well.
Kuang-che Wue41e0062017-09-01 19:04:14 +080082 """
83 try:
84 return util.check_output(
85 'git', 'cat-file', '-t', rev, cwd=git_repo) == 'commit\n'
86 except subprocess.CalledProcessError:
87 return False
Kuang-che Wu3eb6b502018-06-06 16:15:18 +080088 except OSError:
89 return False
Kuang-che Wue41e0062017-09-01 19:04:14 +080090
91
Kuang-che Wubfc4a642018-04-19 11:54:08 +080092def is_ancestor_commit(git_repo, old, new):
93 """Determines `old` commit is ancestor of `new` commit.
94
95 Args:
96 git_repo: path of git repo.
97 old: the ancestor commit.
98 new: the descendant commit.
99
100 Returns:
101 True only if `old` is the ancestor of `new`. One commit is not considered
102 as ancestor of itself.
103 """
104 return util.check_output(
105 'git',
106 'rev-list',
107 '--ancestry-path',
108 '-1',
109 '%s..%s' % (old, new),
110 cwd=git_repo) != ''
111
112
Kuang-che Wue41e0062017-09-01 19:04:14 +0800113def get_revlist(git_repo, old, new):
114 """Enumerates git commit between two revisions (inclusive).
115
116 Args:
117 git_repo: path of git repo.
118 old: git commit revision.
119 new: git commit revision.
120
121 Returns:
122 list of git revisions. The list contains the input revisions, old and new.
123 """
124 assert old
125 assert new
126 cmd = ['git', 'rev-list', '--reverse', '%s^..%s' % (old, new)]
127 revlist = util.check_output(*cmd, cwd=git_repo).splitlines()
128 return revlist
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800129
130
131def get_commit_log(git_repo, rev):
132 """Get git commit log.
133
134 Args:
135 git_repo: path of git repo.
136 rev: git commit revision.
137
138 Returns:
139 commit log message
140 """
141 cmd = ['git', 'log', '-1', '--format=%B', rev]
142 msg = util.check_output(*cmd, cwd=git_repo)
143 return msg
144
145
Kuang-che Wu68db08a2018-03-30 11:50:34 +0800146def get_commit_hash(git_repo, rev):
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800147 """Get git commit hash.
148
149 Args:
150 git_repo: path of git repo.
151 rev: could be git tag, branch, or (shortened) commit hash
152
153 Returns:
154 full git commit hash
155 """
156 cmd = ['git', 'rev-parse', rev]
Kuang-che Wu68db08a2018-03-30 11:50:34 +0800157 git_rev = util.check_output(*cmd, cwd=git_repo).strip()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800158 assert git_rev
159 return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800160
161
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800162def get_commit_time(git_repo, rev, path):
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800163 """Get git commit timestamp.
164
165 Args:
166 git_repo: path of git repo
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800167 rev: git commit id, branch name, tag name, or other git object
168 path: path, relative to git_repo
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800169
170 Returns:
171 timestamp (int)
172 """
173 line = util.check_output(
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800174 'git', 'log', '-1', '--format=%ct', rev, '--', path, cwd=git_repo)
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800175 return int(line)
176
177
178def get_file_from_revision(git_repo, rev, path):
179 """Get file content of given revision.
180
181 Args:
182 git_repo: path of git repo
183 rev: git commit id
184 path: file path
185
186 Returns:
187 file content (str)
188 """
189 return util.check_output(
190 'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_output=False)
191
192
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800193def list_dir_from_revision(git_repo, rev, path):
194 """Lists entries of directory of given revision.
195
196 Args:
197 git_repo: path of git repo
198 rev: git commit id
199 path: directory path, relative to git root
200
201 Returns:
202 list of names
203
204 Raises:
205 subprocess.CalledProcessError: if `path` doesn't exists in `rev`
206 """
207 return util.check_output(
208 'git',
209 'ls-tree',
210 '--name-only',
211 '%s:%s' % (rev, path),
212 cwd=git_repo,
213 log_output=False).splitlines()
214
215
Kuang-che Wu89ac2e72018-07-25 17:39:07 +0800216def get_rev_by_time(git_repo, timestamp, path=None, branch='HEAD'):
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800217 """Query commit of given time.
218
219 Args:
220 git_repo: path of git repo.
221 timestamp: timestamp
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800222 path: only query history of path, relative to git_repo
Kuang-che Wu89ac2e72018-07-25 17:39:07 +0800223 branch: only the selected subset of history to query. If branch name is
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800224 specified, only parent of the said branch is queried. If omitted, only
Kuang-che Wud1d45b42018-07-05 00:46:45 +0800225 queries the parent of HEAD.
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800226
227 Returns:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800228 git commit hash. None if path didn't exist at the given time.
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800229 """
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800230
231 cmd = [
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800232 'git',
Kuang-che Wud1d45b42018-07-05 00:46:45 +0800233 'rev-list',
234 '--first-parent',
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800235 '-1',
236 '--before',
237 str(timestamp),
Kuang-che Wu89ac2e72018-07-25 17:39:07 +0800238 branch,
239 ]
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800240 if path:
241 cmd += ['--', path]
242
243 result = util.check_output(*cmd, cwd=git_repo).strip()
244 return result or None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800245
246
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800247def get_history(git_repo, path, after=None, before=None, padding=False):
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800248 """Get commit history of given path.
249
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800250 `after` and `before` could be outside of lifetime of `path`. `padding` is
251 used to control what to return for such cases.
252
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800253 Args:
254 git_repo: path of git repo.
255 path: path to query, relative to git_repo
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800256 after: limit history after given time (inclusive)
257 before: limit history before given time (inclusive)
258 padding: If True, pads returned result with dummy record at exact 'after'
259 and 'before' time, if 'path' existed at that time. Otherwise, only
260 returns real commits.
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800261
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800262 Returns:
263 List of (timestamp, git hash); They are all events when `path` was added,
264 removed, modified, and start and end time if `padding` is true.
265
266 For each pair, at `timestamp`, the repo state is `git hash`. In other
267 words, `timestamp` is not necessary the commit time of `git hash` for the
268 padded entries.
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800269 """
Kuang-che Wud1d45b42018-07-05 00:46:45 +0800270 cmd = ['git', 'log', '--reverse', '--first-parent', '--format=%ct %H']
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800271 if after:
272 cmd += ['--after', str(after)]
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800273 if before:
274 cmd += ['--before', str(before)]
275 # '--' is necessary otherwise if `path` is removed in current revision, git
276 # will complain it's an ambiguous argument which may be path or something
277 # else (like git branch name, tag name, etc.)
278 cmd += ['--', path]
279
280 result = []
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800281 for line in util.check_output(*cmd, cwd=git_repo).splitlines():
282 commit_time, git_rev = line.split()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800283 result.append((int(commit_time), git_rev))
284
285 if padding:
286 assert before or after, "padding=True make no sense if they are both None"
Kuang-che Wu89ac2e72018-07-25 17:39:07 +0800287 if before is not None and get_rev_by_time(git_repo, before, path=path):
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800288 before = int(before)
289 if not result or result[-1][0] != before:
290 git_rev = get_rev_by_time(git_repo, before)
291 assert git_rev
292 result.append((before, git_rev))
Kuang-che Wu89ac2e72018-07-25 17:39:07 +0800293 if after is not None and get_rev_by_time(git_repo, after, path=path):
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800294 after = int(after)
295 if not result or result[0][0] != after:
296 git_rev = get_rev_by_time(git_repo, after)
297 assert git_rev
298 result.insert(0, (after, git_rev))
299
300 return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800301
302
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800303def get_history_recursively(git_repo, path, after, before, parser_callback):
304 """Get commit history of given path and its dependencies.
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800305
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800306 In comparison to get_history(), get_history_recursively also takes
307 dependencies into consideration. For example, if file A referenced file B,
308 get_history_recursively(A) will return commits of B in addition to A. This
309 applies recursively, so commits of C will be included if file B referenced
310 file C, and so on.
311
312 This function is file type neutral. `parser_callback(filename, content)` will
313 be invoked to parse file content and should return list of filename of
314 dependencies.
315
316 Args:
317 git_repo: path of git repo
318 path: path to query, relative to git_repo
319 after: limit history after given time (inclusive)
320 before: limit history before given time (inclusive)
321 parser_callback: callback to parse file content. See above comment.
322
323 Returns:
324 list of (commit timestamp, git hash)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800325 """
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800326 history = get_history(
327 git_repo, path, after=after, before=before, padding=True)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800328
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800329 # Collect include information of each commit.
330 includes = {}
331 for commit_time, git_rev in history:
332 content = get_file_from_revision(git_repo, git_rev, path)
333 for include_name in parser_callback(path, content):
334 if include_name not in includes:
335 includes[include_name] = set()
336 includes[include_name].add(git_rev)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800337
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800338 # Analyze the start time and end time of each include.
339 dependencies = []
340 for include in includes:
341 appeared = None
342 for commit_time, git_rev in history:
343 if git_rev in includes[include]:
344 if not appeared:
345 appeared = commit_time
346 else:
347 if appeared:
348 dependencies.append((include, appeared, commit_time))
349 appeared = None
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800350
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800351 if appeared is not None:
352 dependencies.append((include, appeared, before))
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800353
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800354 # Recursion and merge.
355 result = list(history)
356 for include, appeared, disappeared in dependencies:
357 result += get_history_recursively(git_repo, include, appeared, disappeared,
358 parser_callback)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800359
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800360 # Sort and dedup.
361 result2 = []
362 for x in sorted(result):
363 if result2 and result2[-1] == x:
364 continue
365 result2.append(x)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800366
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800367 return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800368
369
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800370def list_commits_between_commits(git_repo, old, new):
371 """Get all commits between (old, new].
372
373 Args:
374 git_repo: path of git repo.
375 old: old commit hash (exclusive)
376 new: new commit hash (inclusive)
377
378 Returns:
379 list of (timestamp, rev)
380 """
381 assert old and new
382 assert old == new or is_ancestor_commit(git_repo, old, new)
383 commits = []
384 # --first-parent is necessary for Android, see following link for more
385 # discussion.
386 # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
387 for line in util.check_output(
388 'git',
389 'rev-list',
390 '--timestamp',
391 '--reverse',
392 '--first-parent',
393 '%s..%s' % (old, new),
394 cwd=git_repo).splitlines():
395 timestamp, git_rev = line.split()
396 commits.append([int(timestamp), git_rev])
397
398 # bisect-kit has a fundamental assumption that commit timestamps are
399 # increasing because we sort and bisect the commits by timestamp across git
400 # repos. If not increasing, we have to adjust the timestamp as workaround.
401 # This might lead to bad bisect result, however the bad probability is low in
402 # practice since most machines' clocks are good enough.
403 if commits != sorted(commits, key=lambda x: x[0]):
404 logger.warning('Commit timestamps are not increasing')
405 last_timestamp = -1
406 adjusted = 0
407 for commit in commits:
408 if commit[0] < last_timestamp:
409 commit[0] = last_timestamp
410 adjusted += 1
411
412 last_timestamp = commit[0]
413 logger.warning('%d timestamps adjusted', adjusted)
414
415 return commits