blob: 56a4a22ac60f5f6883f77e870cbb01f95f6697dd [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
8import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +08009import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080010import re
11import subprocess
12
13from bisect_kit import cli
14from bisect_kit import util
15
16logger = logging.getLogger(__name__)
17
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hexadecimal digits
  are accepted.

  Args:
    s: string to examine.

  Returns:
    True if `s` consists solely of lowercase hex digits and its length is
    within the accepted range; False otherwise.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor the end with \Z rather than '$': '$' also matches just before a
  # trailing newline, which would let strings like 'deadbeef\n' slip through.
  return bool(re.match(r'[0-9a-f]+\Z', s))
38
39
def argtype_git_rev(s):
  """Validates that a command line argument is a git hash.

  Args:
    s: argument string to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: if `s` is not a git hash-like string. It is constructed
        with the error message and the sample value '1a2b3c4d5e'.
  """
  if is_git_rev(s):
    return s

  msg = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
  raise cli.ArgTypeError(msg, '1a2b3c4d5e')
46
47
def is_git_root(path):
  """Is given path root of git repo.

  The check only tests whether an entry named '.git' exists directly under
  `path`; git itself is not invoked.

  Args:
    path: directory path to examine.

  Returns:
    True if `path`/.git exists.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
51
52
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True only if `path` is a directory and running
    'git rev-parse --is-bare-repository' there prints 'true'. Returns False
    if `path` is not a directory or the git command fails.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      # git refuses to answer, e.g. `path` is not inside any git repo.
      return False
    return answer == 'true\n'
  return False
62
63
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.
  The clone is performed into '.' with `git_repo` as working directory.

  Args:
    git_repo: path of the destination git repo.
    repo_url: url of the repository to clone from.
    reference: if given, it is passed to 'git clone' via '--reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  reference_args = ['--reference', reference] if reference else []
  cmd = ['git', 'clone', repo_url, '.'] + reference_args
  util.check_call(*cmd, cwd=git_repo)
71
72
def checkout_version(git_repo, rev):
  """Checks out given revision by 'git checkout -q -f'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
81
82
def init(git_repo):
  """Initializes a git repo by 'git init -q'.

  The directory `git_repo`, including missing parent directories, is created
  first if it doesn't exist.

  Args:
    git_repo: path of git repo.
  """
  repo_missing = not os.path.exists(git_repo)
  if repo_missing:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
95
96
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes a file and commits it.

  The file (and its parent directories, if missing) is created under
  `git_repo`, then added and committed by 'git add' and 'git commit'.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time` if None.
  """
  if author_time is None:
    author_time = commit_time

  # Only export the dates which are actually specified.
  env = {}
  date_vars = (
      ('GIT_AUTHOR_DATE', author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  for var_name, value in date_vars:
    if value:
      env[var_name] = str(value)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
132
133
def config(git_repo, *args):
  """Runs 'git config' inside given repo.

  Args:
    git_repo: path of git repo.
    args: parameters appended to the 'git config' command line
  """
  cmd = ['git', 'config'] + list(args)
  util.check_call(*cmd, cwd=git_repo)
142
143
def fetch(git_repo, *args):
  """Runs 'git fetch' inside given repo.

  Args:
    git_repo: path of git repo.
    args: parameters appended to the 'git fetch' command line
  """
  cmd = ['git', 'fetch'] + list(args)
  util.check_call(*cmd, cwd=git_repo)
152
153
def is_containing_commit(git_repo, rev):
  """Determines whether given commit exists in the repo.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if 'git cat-file -t' reports `rev` as a commit object in `git_repo`.
    False if the git command fails or raises OSError (for example, git_repo
    is not a git folder).
  """
  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800172
173
def is_ancestor_commit(git_repo, old, new):
  """Determines whether `old` commit is ancestor of `new` commit.

  This asks 'git rev-list --ancestry-path -1 old..new' for a commit; `old` is
  treated as an ancestor if git printed anything.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  output = util.check_output(*cmd, cwd=git_repo)
  return output != ''
193
194
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The metadata is parsed from the output of 'git cat-file -p'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits, in the order they are listed in the
          commit object (merge commits have more than one); this field is
          unavailable for the very first commit of git repo.
      author: name and email of author
      author_time: author timestamp (int, without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (int, without timezone information)
      message: commit message text
  """
  meta = {}
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_output=False)
  # Header lines and commit message are separated by the first empty line.
  header, meta['message'] = data.split('\n\n', 1)
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # A commit object carries one 'parent' line per parent, so accumulate
      # them instead of overwriting; otherwise only the last parent of a
      # merge commit would be kept.
      meta.setdefault('parent', []).append(m.group(1))
      continue

    # Format: "<author|committer> <name and email> <timestamp> <timezone>".
    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
234
235
def get_revlist(git_repo, old, new):
  """Enumerates git commits between two revisions (inclusive).

  The commits are listed by 'git rev-list --reverse old^..new'.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions, one per output line of git. The list contains the
    input revisions, old and new.
  """
  assert old
  assert new
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800252
253
def get_commit_log(git_repo, rev):
  """Get git commit log message.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message, as printed by 'git log -1 --format=%B'
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
267
268
def get_commit_hash(git_repo, rev):
  """Resolves a revision to full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # '^{commit}' restricts the search to commits only.
  # The trailing '--' avoids ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Strip the trailing newlines and '-' characters from the output.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800294
295
def get_commit_time(git_repo, rev, path):
  """Get git commit timestamp.

  The timestamp is the '%ct' field printed by 'git log -1' for `rev`,
  limited to `path`.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo

  Returns:
    timestamp (int)
  """
  cmd = ['git', 'log', '-1', '--format=%ct', rev, '--', path]
  output = util.check_output(*cmd, cwd=git_repo)
  return int(output)
310
311
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  The content is obtained by 'git show <rev>:<path>'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  blob_spec = '%s:%s' % (rev, path)
  return util.check_output(
      'git', 'show', blob_spec, cwd=git_repo, log_output=False)
325
326
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  The entries are obtained by 'git ls-tree --name-only <rev>:<path>'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exist in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  output = util.check_output(
      'git',
      'ls-tree',
      '--name-only',
      tree_spec,
      cwd=git_repo,
      log_output=False)
  return output.splitlines()
348
349
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  This runs 'git rev-list --first-parent -1 --before <timestamp>' on `branch`.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if git printed nothing, e.g. path didn't exist at
    the given time.
  """
  cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch or 'HEAD',
  ]
  if path:
    cmd.extend(['--', path])

  output = util.check_output(*cmd, cwd=git_repo)
  return output.strip() or None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800380
381
def get_history(git_repo,
                path,
                branch=None,
                after=None,
                before=None,
                padding=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding` is
  used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name; must not look like a git commit hash
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding: If True, pads returned result with dummy record at exact 'after'
        and 'before' time, if 'path' existed at that time. Otherwise, only
        returns real commits.

  Returns:
    List of (timestamp, git hash); They are all events when `path` was added,
    removed, modified, and start and end time if `padding` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessarily the commit time of `git hash` for
    the padded entries.
  """
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=%ct %H']
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  # '--' is necessary otherwise if `path` is removed in current revision, git
  # will complain it's an ambiguous argument which may be path or something
  # else (like git branch name, tag name, etc.)
  cmd.extend(['--', path])

  # Each output line is "<commit timestamp> <commit hash>".
  fields = (
      line.split()
      for line in util.check_output(*cmd, cwd=git_repo).splitlines())
  result = [(int(commit_time), git_rev) for commit_time, git_rev in fields]

  if not padding:
    return result

  assert before or after, "padding=True make no sense if they are both None"

  # Pad the tail: only if `path` had history at `before` and the last real
  # commit is not exactly at that time.
  if before is not None and get_rev_by_time(
      git_repo, before, branch, path=path):
    end_time = int(before)
    if not result or result[-1][0] != end_time:
      end_rev = get_rev_by_time(git_repo, end_time, branch)
      assert end_rev
      result.append((end_time, end_rev))

  # Pad the head in the same way for `after`.
  if after is not None and get_rev_by_time(
      git_repo, after, branch, path=path):
    start_time = int(after)
    if not result or result[0][0] != start_time:
      start_rev = get_rev_by_time(git_repo, start_time, branch)
      assert start_rev
      result.insert(0, (start_time, start_rev))

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800447
448
def get_history_recursively(git_repo, path, after, before, parser_callback):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.

  Returns:
    list of (commit timestamp, git hash), sorted by timestamp and without
    duplicated entries.
  """
  history = get_history(
      git_repo, path, after=after, before=before, padding=True)

  # Collect include information of each commit: for every dependency name,
  # the set of revisions of `path` which referenced it.
  includes = {}
  for _, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    for include_name in parser_callback(path, content):
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include. One include may
  # appear and disappear several times, yielding several time ranges.
  dependencies = []
  for include, including_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        # Compare against None explicitly so the range start is tracked by
        # presence, not by truthiness of the timestamp.
        if appeared is None:
          appeared = commit_time
      elif appeared is not None:
        dependencies.append((include, appeared, commit_time))
        appeared = None

    # Still included at the end of history; the range lasts until `before`.
    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(git_repo, include, appeared, disappeared,
                                      parser_callback)

  # Sort and dedup. The sort is stable and keyed by timestamp only, so
  # identical entries are not necessarily adjacent when different commits
  # share one timestamp; track seen entries instead of comparing neighbors.
  seen = set()
  deduped = []
  for entry in sorted(result, key=lambda x: x[0]):
    if entry in seen:
      continue
    seen.add(entry)
    deduped.append(entry)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800514
515
def list_commits_between_commits(git_repo, old, new):
  """Get all commits between (old, new].

  Commits are listed by 'git rev-list --first-parent'. Timestamps which go
  backward are bumped to the preceding timestamp, so the returned timestamps
  never decrease.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev); each item is a two-element list.
  """
  assert old and new
  assert old == new or is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)
  commits = []
  for line in output.splitlines():
    timestamp, git_rev = line.split()
    commits.append([int(timestamp), git_rev])

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  out_of_order = any(
      later[0] < earlier[0] for earlier, later in zip(commits, commits[1:]))
  if out_of_order:
    logger.warning('Commit timestamps are not increasing')
    floor = -1
    adjusted = 0
    for commit in commits:
      if commit[0] < floor:
        # Went backward; clamp to the latest timestamp seen so far.
        commit[0] = floor
        adjusted += 1
      else:
        floor = commit[0]
    logger.warning('%d timestamps adjusted', adjusted)

  return commits
561 return commits