blob: f633dc003f2958f8b77330428b461046c7228379 [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
8import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +08009import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080010import re
11import subprocess
12
13from bisect_kit import cli
14from bisect_kit import util
15
16logger = logging.getLogger(__name__)
17
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to check.

  Returns:
    True if `s` looks like a (possibly shortened) git commit hash.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor with \Z rather than $: '$' also matches just before a trailing
  # newline, which would let strings like 'abcdef1\n' pass the check.
  return bool(re.match(r'^[0-9a-f]+\Z', s))
38
39
def argtype_git_rev(s):
  """Argument type checker for git hash.

  Args:
    s: argument string to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: `s` is not a git hash-like string.
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
46
47
def is_git_root(path):
  """Tells whether `path` is the root of a git repo.

  Args:
    path: directory path to check.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
51
52
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the directory to clone into.
    repo_url: url (or path) of the repo to clone from.
    reference: if specified, value passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  # The clone runs with cwd=git_repo, so the destination is '.'.
  reference_args = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_args, cwd=git_repo)
60
61
def checkout_version(git_repo, rev):
  """Checks out the given revision quietly and forcibly ('-q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
70
71
def init(git_repo):
  """Runs 'git init' inside `git_repo`.

  The directory (including missing parent directories) is created first if it
  doesn't exist yet.

  Args:
    git_repo: path of git repo.
  """
  already_there = os.path.exists(git_repo)
  if not already_there:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
84
85
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits it.

  Missing parent directories of the file are created. The file is then
  'git add'-ed and committed on its own.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time` when None.
  """
  # Only timestamps with a truthy value are exported to git.
  effective_author_time = commit_time if author_time is None else author_time
  date_variables = (
      ('GIT_AUTHOR_DATE', effective_author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  env = {name: str(value) for name, value in date_variables if value}

  target = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(target)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(target, 'w') as output_file:
    output_file.write(content)

  add_cmd = ['git', 'add', path]
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*add_cmd, cwd=git_repo)
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
121
122
def config(git_repo, *args):
  """Runs 'git config' with the given arguments.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ['git', 'config'] + list(args)
  util.check_call(*cmd, cwd=git_repo)
131
132
def fetch(git_repo, *args):
  """Runs 'git fetch' with the given arguments.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'
  """
  cmd = ['git', 'fetch'] + list(args)
  util.check_call(*cmd, cwd=git_repo)
141
142
def is_containing_commit(git_repo, rev):
  """Tells whether the given commit exists in the repo.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    # Either git failed to resolve `rev`, or the command could not be run at
    # all; both count as "not containing".
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800161
162
def is_ancestor_commit(git_repo, old, new):
  """Tells whether `old` commit is an ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  # Any output means at least one commit lies on the path from old to new.
  output = util.check_output(*cmd, cwd=git_repo)
  return output != ''
182
183
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The metadata is parsed from the output of 'git cat-file -p'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits (more than one for merge commits); this
          field is unavailable for the very first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  meta = {}
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_output=False)
  # Header lines and the commit message are separated by the first blank line.
  header, meta['message'] = data.split('\n\n', 1)
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # A merge commit carries one 'parent' header line per parent, so
      # accumulate instead of overwriting to keep all of them.
      meta.setdefault('parent', []).append(m.group(1))
      continue

    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      # group(2) is "name <email>", group(3) the unix timestamp; the trailing
      # timezone offset (group 4) is intentionally dropped.
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
223
224
def get_revlist(git_repo, old, new):
  """Lists git commits between two revisions (inclusive), oldest first.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^..new' starts from the parent of `old`, so `old` itself is included.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800241
242
def get_commit_log(git_repo, rev):
  """Gets the raw commit message ('%B') of a revision.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
256
257
def get_commit_hash(git_repo, rev):
  """Resolves a revision to its full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Drop the trailing '--' marker and newlines.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800283
284
def get_commit_time(git_repo, rev, path):
  """Gets the commit timestamp of the last commit touching `path` up to `rev`.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo

  Returns:
    timestamp (int)
  """
  cmd = ['git', 'log', '-1', '--format=%ct', rev, '--', path]
  output = util.check_output(*cmd, cwd=git_repo)
  return int(output)
299
300
def get_file_from_revision(git_repo, rev, path):
  """Reads the content of a file as of the given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  blob_spec = '%s:%s' % (rev, path)
  return util.check_output(
      'git', 'show', blob_spec, cwd=git_repo, log_output=False)
314
315
def list_dir_from_revision(git_repo, rev, path):
  """Lists directory entries as of the given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  cmd = ['git', 'ls-tree', '--name-only', tree_spec]
  output = util.check_output(*cmd, cwd=git_repo, log_output=False)
  return output.splitlines()
337
338
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Finds the latest first-parent commit at or before the given time.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch or 'HEAD',
  ]
  if path:
    cmd.extend(['--', path])

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  # Empty output means no commit matched.
  return git_rev if git_rev else None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800369
370
def get_history(git_repo,
                path,
                branch=None,
                after=None,
                before=None,
                padding=False):
  """Gets first-parent commit history of given path, oldest first.

  `after` and `before` could be outside of lifetime of `path`. `padding` is
  used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding: If True, pads returned result with dummy record at exact 'after'
        and 'before' time, if 'path' existed at that time. Otherwise, only
        returns real commits.

  Returns:
    List of (timestamp, git hash); They are all events when `path` was added,
    removed, modified, and start and end time if `padding` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_cmd = ['git', 'log', '--reverse', '--first-parent', '--format=%ct %H']
  if after:
    log_cmd.extend(['--after', str(after)])
  if before:
    log_cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    log_cmd.append(branch)
  # '--' is necessary otherwise if `path` is removed in current revision, git
  # will complain it's an ambiguous argument which may be path or something
  # else (like git branch name, tag name, etc.)
  log_cmd.extend(['--', path])

  history = []
  for entry in util.check_output(*log_cmd, cwd=git_repo).splitlines():
    timestamp, commit_hash = entry.split()
    history.append((int(timestamp), commit_hash))

  if not padding:
    return history

  assert before or after, "padding=True make no sense if they are both None"

  # Pad the tail first, then the head; an entry is only added when `path`
  # existed at that time and no real commit already sits at that timestamp.
  if before is not None and get_rev_by_time(
      git_repo, before, branch, path=path):
    end_time = int(before)
    if not history or history[-1][0] != end_time:
      end_rev = get_rev_by_time(git_repo, end_time, branch)
      assert end_rev
      history.append((end_time, end_rev))

  if after is not None and get_rev_by_time(
      git_repo, after, branch, path=path):
    start_time = int(after)
    if not history or history[0][0] != start_time:
      start_rev = get_rev_by_time(git_repo, start_time, branch)
      assert start_rev
      history.insert(0, (start_time, start_rev))

  return history
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800436
437
def get_history_recursively(git_repo, path, after, before, parser_callback):
  """Gets commit history of given path together with its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo, path, after=after, before=before, padding=True)

  # For every dependency name, collect the revisions of `path` referencing it.
  revs_by_include = {}
  for _, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    for include_name in parser_callback(path, content):
      revs_by_include.setdefault(include_name, set()).add(git_rev)

  # Turn those revisions into (name, start time, end time) intervals during
  # which the dependency was referenced.
  dependencies = []
  for include_name, revs in revs_by_include.items():
    start = None
    for commit_time, git_rev in history:
      referenced = git_rev in revs
      if referenced and not start:
        start = commit_time
      elif not referenced and start:
        dependencies.append((include_name, start, commit_time))
        start = None

    # Still referenced at the end of history: the interval extends to `before`.
    if start is not None:
      dependencies.append((include_name, start, before))

  # Recurse into each dependency and merge its history into ours.
  merged = list(history)
  for include_name, start, end in dependencies:
    merged.extend(
        get_history_recursively(git_repo, include_name, start, end,
                                parser_callback))

  # Sort by timestamp only (stable) and drop adjacent duplicates.
  deduped = []
  for item in sorted(merged, key=lambda x: x[0]):
    if not deduped or deduped[-1] != item:
      deduped.append(item)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800503
504
def list_commits_between_commits(git_repo, old, new):
  """Lists first-parent commits in the range (old, new], oldest first.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of [timestamp, rev] pairs. Timestamps may have been adjusted upward
    so that they never decrease along the list.
  """
  assert old and new
  assert old == new or is_ancestor_commit(git_repo, old, new)
  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)
  commits = [[int(timestamp), git_rev]
             for timestamp, git_rev in (line.split()
                                        for line in output.splitlines())]

  if commits == sorted(commits, key=lambda x: x[0]):
    return commits

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  logger.warning('Commit timestamps are not increasing')
  adjusted = 0
  latest = -1
  for commit in commits:
    if commit[0] < latest:
      # Clamp a timestamp going backwards to the latest one seen so far.
      commit[0] = latest
      adjusted += 1
    else:
      latest = commit[0]
  logger.warning('%d timestamps adjusted', adjusted)

  return commits
550 return commits