blob: c34fa3dbbba75b7d2d470fdf12bb24a24c41f378 [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
8import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +08009import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080010import re
11import subprocess
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080012import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080013
14from bisect_kit import cli
15from bisect_kit import util
16
17logger = logging.getLogger(__name__)
18
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hexadecimal digits
  are accepted and the whole string has to match.

  Args:
    s: string to check.

  Returns:
    True if `s` looks like a (possibly shortened) git commit hash.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor with \Z instead of $: '$' also matches right before a trailing
  # newline, which would wrongly accept strings like '1a2b3c4\n'.
  return bool(re.match(r'[0-9a-f]+\Z', s))
39
40
def argtype_git_rev(s):
  """Argument type checker for git hashes.

  Args:
    s: string to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: if is_git_rev() rejects `s`.
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
47
48
def is_git_root(path):
  """Tells whether the given path is the root of a git repo.

  This is a pure filesystem check: it only looks for an entry named '.git'
  directly under `path`.

  Args:
    path: directory path to check.

  Returns:
    True if `path`/.git exists.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
52
53
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Runs 'git rev-parse --is-bare-repository' inside `path` and inspects its
  output.

  Args:
    path: directory path to check.

  Returns:
    True if git reports 'true'. False if `path` is not a directory, if the git
    command fails, or if git reports anything else.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
63
64
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.
  The clone is performed inside `git_repo` with '.' as destination.

  Args:
    git_repo: path of the directory to clone into.
    repo_url: url of the repo to clone from.
    reference: if specified, it is passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_args = ['--reference', reference] if reference else []
  clone_cmd = ['git', 'clone', repo_url, '.'] + reference_args
  util.check_call(*clone_cmd, cwd=git_repo)
72
73
def checkout_version(git_repo, rev):
  """Checks out given revision via 'git checkout -q -f'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  checkout_cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*checkout_cmd, cwd=git_repo)
82
83
def init(git_repo):
  """Runs 'git init' inside given directory.

  The directory (and any missing parent directories) is created first when it
  doesn't exist yet.

  Args:
    git_repo: path of git repo.
  """
  repo_exists = os.path.exists(git_repo)
  if not repo_exists:
    os.makedirs(git_repo)

  init_cmd = ['git', 'init', '-q']
  util.check_call(*init_cmd, cwd=git_repo)
96
97
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes a file and commits it.

  The file is (over)written with `content`, staged by 'git add' and then
  committed alone by 'git commit'. Missing parent directories of the file are
  created.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; passed to git via GIT_COMMITTER_DATE if
        given.
    author_time: author timestamp; passed to git via GIT_AUTHOR_DATE.
        Defaults to `commit_time` if omitted.
  """
  if author_time is None:
    author_time = commit_time

  # Only export the date variables which carry a (truthy) value.
  date_vars = (
      ('GIT_AUTHOR_DATE', author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  env = {name: str(value) for name, value in date_vars if value}

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as f:
    f.write(content)

  add_cmd = ['git', 'add', path]
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*add_cmd, cwd=git_repo)
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
133
134
def config(git_repo, *args):
  """Runs 'git config' with given arguments.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  config_cmd = ['git', 'config'] + list(args)
  util.check_call(*config_cmd, cwd=git_repo)
143
144
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  Only failures whose stderr contains 'The requested URL returned error: 5'
  (i.e. HTTP 5xx server errors) are retried; any other failure is raised
  immediately. At most 5 attempts are made and the delay before each retry
  grows exponentially (20, 40, 60, 60 seconds; capped at 60 seconds).

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: 'git fetch' failed with non-retryable
        error, or it still failed after all attempts.
  """
  max_tries = 5
  for tries in range(max_tries):
    if tries > 0:
      delay = min(60, 10 * 2**tries)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)

    stderr_lines = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=stderr_lines.append)
      return
    except subprocess.CalledProcessError:
      stderr = ''.join(stderr_lines)
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in stderr:
        raise
      if tries == max_tries - 1:
        # Reached retry limit but haven't succeeded.
        logger.error('git fetch failed too much times')
        # Re-raise here, while the exception is still being handled. A bare
        # 'raise' after the except block is over fails with RuntimeError on
        # Python 3 because there is no active exception anymore.
        raise
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800179
180
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  It asks 'git cat-file -t' for the object type of `rev`.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    # Either git failed or the command could not be run in git_repo at all.
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800199
200
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  The answer is derived from whether 'git rev-list --ancestry-path -1' prints
  anything for the range `old`..`new`.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--ancestry-path', '-1', rev_range, cwd=git_repo)
  return output != ''
220
221
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The metadata is parsed from the output of 'git cat-file -p'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits (more than one for merge commits); this
          field is unavailable for the very first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  meta = {}
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_output=False)
  # Header lines and commit message are separated by the first blank line.
  header, meta['message'] = data.split('\n\n', 1)
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # There is one 'parent' line per parent, so a merge commit has several
      # of them. Accumulate instead of overwriting, otherwise only the last
      # parent would be reported.
      meta.setdefault('parent', []).append(m.group(1))
      continue

    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
261
262
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  The commits are listed by 'git rev-list --reverse' for the range
  `old`^..`new`.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800279
280
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message, as printed by 'git log -1 --format=%B'
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
294
295
def get_commit_hash(git_repo, rev):
  """Resolves a revision to its full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  rev_parse_cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*rev_parse_cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Strip the trailing newlines and the echoed '--'.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800321
322
def get_commit_time(git_repo, rev, path):
  """Get git commit timestamp.

  The timestamp is taken from the first entry 'git log' prints for `path`,
  starting from `rev`.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo

  Returns:
    timestamp (int)
  """
  log_cmd = ['git', 'log', '-1', '--format=%ct', rev, '--', path]
  output = util.check_output(*log_cmd, cwd=git_repo)
  return int(output)
337
338
def get_file_from_revision(git_repo, rev, path):
  """Reads file content of given revision via 'git show'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  blob_spec = '%s:%s' % (rev, path)
  return util.check_output(
      'git', 'show', blob_spec, cwd=git_repo, log_output=False)
352
353
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision via 'git ls-tree'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  output = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_output=False)
  return output.splitlines()
375
376
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  It asks 'git rev-list --first-parent -1 --before' for the first commit it
  lists, optionally restricted to the history of `path`.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  rev_list_cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch or 'HEAD',
  ]
  if path:
    rev_list_cmd.extend(['--', path])

  git_rev = util.check_output(*rev_list_cmd, cwd=git_repo).strip()
  if not git_rev:
    return None
  return git_rev
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800407
408
def get_history(git_repo,
                path,
                branch=None,
                after=None,
                before=None,
                padding=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding` is
  used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding: If True, pads returned result with dummy record at exact 'after'
        and 'before' time, if 'path' existed at that time. Otherwise, only
        returns real commits.

  Returns:
    List of (timestamp, git hash); They are all events when `path` was added,
    removed, modified, and start and end time if `padding` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_cmd = ['git', 'log', '--reverse', '--first-parent', '--format=%ct %H']
  for option, value in (('--after', after), ('--before', before)):
    if value:
      log_cmd += [option, str(value)]
  if branch:
    assert not is_git_rev(branch)
    log_cmd.append(branch)
  # '--' is necessary otherwise if `path` is removed in current revision, git
  # will complain it's an ambiguous argument which may be path or something
  # else (like git branch name, tag name, etc.)
  log_cmd += ['--', path]

  result = []
  for line in util.check_output(*log_cmd, cwd=git_repo).splitlines():
    timestamp, commit = line.split()
    result.append((int(timestamp), commit))

  if not padding:
    return result

  assert before or after, "padding=True make no sense if they are both None"

  # Pad the tail: state of the repo at exact `before` time, if `path` has
  # history at that moment.
  if before is not None and get_rev_by_time(
      git_repo, before, branch, path=path):
    before = int(before)
    if not result or result[-1][0] != before:
      git_rev = get_rev_by_time(git_repo, before, branch)
      assert git_rev
      result.append((before, git_rev))

  # Pad the head: state of the repo at exact `after` time, likewise.
  if after is not None and get_rev_by_time(
      git_repo, after, branch, path=path):
    after = int(after)
    if not result or result[0][0] != after:
      git_rev = get_rev_by_time(git_repo, after, branch)
      assert git_rev
      result.insert(0, (after, git_rev))

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800474
475
def get_history_recursively(git_repo, path, after, before, parser_callback):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.

  Returns:
    list of (commit timestamp, git hash), sorted by timestamp, with adjacent
    duplicated entries removed.
  """
  history = get_history(
      git_repo, path, after=after, before=before, padding=True)

  # Map each dependency to the set of revisions of `path` referencing it.
  includes = {}
  for _, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    for include_name in parser_callback(path, content):
      includes.setdefault(include_name, set()).add(git_rev)

  # Work out the time intervals during which each dependency was referenced.
  dependencies = []
  for include, referencing_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in referencing_revs:
        if not appeared:
          appeared = commit_time
      elif appeared:
        dependencies.append((include, appeared, commit_time))
        appeared = None

    # Still referenced at the end of history; the interval lasts to `before`.
    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recurse into each dependency for its interval and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result.extend(
        get_history_recursively(git_repo, include, appeared, disappeared,
                                parser_callback))

  # Sort by timestamp (stable) and drop adjacent duplicates.
  deduped = []
  for entry in sorted(result, key=lambda x: x[0]):
    if not deduped or deduped[-1] != entry:
      deduped.append(entry)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800541
542
def list_commits_between_commits(git_repo, old, new):
  """Get all commits between (old, new].

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev). Each entry is a two-element list and the
    timestamps are adjusted to be non-decreasing if necessary.
  """
  assert old and new
  assert old == new or is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)
  commits = []
  for line in output.splitlines():
    timestamp, git_rev = line.split()
    commits.append([int(timestamp), git_rev])

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  timestamps = [commit[0] for commit in commits]
  if timestamps != sorted(timestamps):
    logger.warning('Commit timestamps are not increasing')
    adjusted = 0
    floor = -1
    for commit in commits:
      if commit[0] < floor:
        # Clamp a timestamp going backwards to the largest one seen so far.
        commit[0] = floor
        adjusted += 1
      else:
        floor = commit[0]
    logger.warning('%d timestamps adjusted', adjusted)

  return commits
588 return commits