blob: 9d7ea3c756ff7975763c7836b52230302ca6f80f [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
22logger = logging.getLogger(__name__)
23
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
#  - 6 digits: 4.85%
#  - 7 digits: 0.32%
#  - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts full or shortened hashes, i.e. strings of lowercase hex digits
  whose length is between GIT_MIN_COMMIT_ID_LENGTH and
  GIT_FULL_COMMIT_ID_LENGTH (inclusive).

  Args:
    s: string to check.

  Returns:
    True if `s` looks like a git commit hash.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch instead of '^...$': '$' also matches right before a trailing
  # newline, which would accept strings like 'deadbeef\n'.
  return bool(re.fullmatch(r'[0-9a-f]+', s))
44
45
def argtype_git_rev(s):
  """Validates that `s` is a git hash and passes it through.

  Args:
    s: string to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: if `s` is not hash-like. The second constructor argument
        is a sample value (presumably shown to the user as an example).
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Is given path root of git repo.

  The check is purely file-system based: `path` is considered a git root if
  an entry named '.git' exists directly inside it.

  Args:
    path: directory path to examine.

  Returns:
    True if `path`/.git exists.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True if `path` is a directory and 'git rev-parse --is-bare-repository'
    run inside it prints 'true'. False if `path` is not a directory or the
    git command fails.
  """
  if not os.path.isdir(path):
    return False
  try:
    answer = util.check_output(
        'git', 'rev-parse', '--is-bare-repository', cwd=path)
  except subprocess.CalledProcessError:
    return False
  return answer == 'true\n'
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` is created if it doesn't exist, and the clone is made into that
  directory itself.

  Args:
    git_repo: path of the local git repo to create.
    repo_url: url of the repo to clone from.
    reference: if specified, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Checks out the given revision quietly and forcibly ('-q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo, initial_branch='main'):
  """git init.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
    initial_branch: the default branch after git init; passed to git via
        '--initial-branch'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q', '--initial-branch', initial_branch]
  util.check_call(*cmd, cwd=git_repo)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800103
104
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits it.

  Missing parent directories of the file are created. Only `path` is added
  and committed.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time` when None.
  """
  if author_time is None:
    author_time = commit_time

  # Only export the dates which are actually specified (and non-zero).
  env = {}
  dates = (
      ('GIT_AUTHOR_DATE', author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  for name, value in dates:
    if value:
      env[name] = str(value)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as f:
    f.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
140
141
def config(git_repo, *args):
  """Runs 'git config' inside the given repo.

  Args:
    git_repo: path of git repo.
    args: parameters appended verbatim to the 'git config' command line.
  """
  cmd = ['git', 'config']
  cmd.extend(args)
  util.check_call(*cmd, cwd=git_repo)
150
151
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  The command is attempted at most 5 times. A failed attempt is retried only
  if git's stderr indicates an HTTP 5xx error; any other failure, or failure
  of the last attempt, re-raises the original exception.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: if git fetch failed and is not retried.
  """
  max_tries = 5
  for tries in range(1, max_tries + 1):
    captured_stderr = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured_stderr.append)
    except subprocess.CalledProcessError:
      if tries >= max_tries:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured_stderr):
        raise
      # Exponential backoff, capped at one minute.
      delay = min(60, 10 * 2**tries)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800183
184
Kuang-che Wued1bb622020-05-30 23:06:23 +0800185def _adjust_timestamp_increasingly(commits):
186 """Adjust commit timestamps.
187
188 After adjust, the timestamps are increasing.
189
190 Args:
191 commits: list of (timestamp, commit hash)
192
193 Returns:
194 (adjusted count, list of (timestamp, commit hash))
195 """
196 result = []
197 adjusted = 0
198 last_timestamp = -1
199 for timestamp, git_rev in commits:
200 if timestamp < last_timestamp:
201 adjusted += 1
202 timestamp = last_timestamp
203 else:
204 last_timestamp = timestamp
205 result.append((timestamp, git_rev))
206 return adjusted, result
207
208
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  It means no data is cached for the query; the caller should fall back to
  the original (uncached) operation.
  """
214
215
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch name of the cached commits.
    optimized_period: (begin, end) timestamps covered by the cache; None if
        optimize() has not been called yet.
    cached: list of (timestamp, commit hash), timestamps are non-decreasing.
    commit_to_index: dict mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Loads commits of `period` into the cache.

    It is no-op if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp), begin <= end.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the commit at given time from the cache.

    Args:
      timestamp: timestamp in query.

    Returns:
      git commit hash. None if there are no commits at or before `timestamp`.

    Raises:
      FastLookupFailed: optimize() is not called yet or `timestamp` is outside
          the cached period.
    """
    if self.optimized_period is None:
      # Nothing is cached yet; let the caller fall back to git.
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed
    if not self.cached:
      # The branch has no commits up to the end of the cached period.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # '' sorts before any commit hash, so `idx` is the first entry whose
    # timestamp >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      # No exact match; use the closest earlier commit.
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Determines whether `rev` is one of the cached commits.

    Args:
      rev: git commit hash in query.

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached; it tells nothing about whether
          the commit exists in the repo.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
264
Kuang-che Wued1bb622020-05-30 23:06:23 +0800265
class FastLookup:
  """Collection of FastLookupEntry, keyed by git repo and then branch.

  Attributes:
    entries: dict of dict; entries[git_repo][branch] is a FastLookupEntry.
    target_period: (begin, end) timestamps to optimize for; None means the
        fast lookup is disabled.
  """

  def __init__(self):
    self.entries = {}
    self.target_period = None

  def optimize(self, period):
    """Enables fast lookup for queries inside `period`."""
    self.target_period = period

  def disable(self):
    """Disables fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up the commit of `branch` at given time.

    The entry for (git_repo, branch) is created and populated on demand.

    Args:
      git_repo: path of git repo.
      timestamp: timestamp in query.
      branch: branch name.

    Returns:
      Whatever FastLookupEntry.get_rev_by_time() returns.

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside
          the target period.
    """
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    branches = self.entries.setdefault(git_repo, {})
    entry = branches.get(branch)
    if entry is None:
      entry = branches[branch] = FastLookupEntry(git_repo, branch)
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Determines whether `rev` is cached by any entry of `git_repo`.

    Args:
      git_repo: path of git repo.
      rev: git commit hash in query.

    Returns:
      True if one of the cached branches contains `rev`.

    Raises:
      FastLookupFailed: nothing is cached for `git_repo` or none of its
          entries knows `rev`.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    for entry in self.entries.get(git_repo, {}).values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
    raise FastLookupFailed


fast_lookup = FastLookup()
308
309
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is used when
  the cache has no answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    # Either git rejected the query or the command could not be launched.
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800334
335
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  Implemented by checking whether 'git rev-list --ancestry-path old..new'
  prints anything.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  output = util.check_output(*cmd, cwd=git_repo)
  return output != ''
356
357
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800358def _parse_commit_object(s):
359 meta = {}
360 header, meta['message'] = s.split('\n\n', 1)
361 for line in header.splitlines():
362 m = re.match(r'^tree (\w+)', line)
363 if m:
364 meta['tree'] = m.group(1)
365 continue
366
367 m = re.match(r'^parent (\w+)', line)
368 if m:
369 meta['parent'] = line.split()[1:]
370 continue
371
372 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
373 if m:
374 meta[m.group(1)] = m.group(2)
375 meta['%s_time' % m.group(1)] = int(m.group(3))
376 continue
377 return meta
378
379
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The commit object is dumped by 'git cat-file -p' and parsed by
  _parse_commit_object().

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  raw_commit = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(raw_commit)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800402
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800403
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of many commits by one 'git cat-file --batch' invocation.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict, which maps object name (as reported by git) to the metadata dict
    returned by _parse_commit_object(). The value is None if git reports the
    object without size (e.g. missing object).
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as query_file:
    query_file.write(query)
    query_file.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # binary=True because we need to count size in bytes later.
    shell_cmd = 'git cat-file --batch < ' + query_file.name
    remaining = util.check_output(
        'sh', '-c', shell_cmd, cwd=git_repo, binary=True)

  metas = {}
  while remaining:
    # Each record starts with "<name> <type> <size>" line, followed by <size>
    # bytes of content and one newline.
    info_line, remaining = remaining.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', info_line.decode('utf8'))
    assert m, repr(info_line)
    name, kind, size_text = m.groups()
    if not size_text:
      metas[name] = None
      continue
    assert kind == 'commit', 'unsupported object type: %s' % kind
    size = int(size_text)
    assert remaining[size] == ord(b'\n'), repr(remaining[size])
    payload = remaining[:size]
    remaining = remaining[size + 1:]
    metas[name] = _parse_commit_object(payload.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800435
436
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  Only first-parent history is followed and the result is in chronological
  (oldest first) order.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # Use "old^" as the exclusive lower bound, so `old` itself is included.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--first-parent', '--reverse', rev_range,
      cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800456
457
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message, as printed by 'git log -1 --format=%B'
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
471
472
def get_commit_hash(git_repo, rev):
  """Get git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Drop the trailing '--' line and newlines.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800498
499
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo. If specified, the history walk of
        'git log' is limited to this path.

  Returns:
    timestamp (int)
  """
  pathspec = ['--', path] if path else []
  output = util.check_output(
      'git', 'log', '-1', '--format=%ct', rev, *pathspec, cwd=git_repo)
  return int(output)
516
517
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  fields = output.split()
  if len(fields) < 4 or fields[3] != path:
    raise ValueError('file %s is not found in repo:%s rev:%s' %
                     (path, git_repo, rev))
  # The first field is the file mode in octal.
  mode = int(fields[0], 8)
  return stat.S_ISLNK(mode)
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800541
542
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link, the link is followed (recursively) and the
  content of the link target is returned.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # For a symbolic link, `result` is the link target, which is relative to
    # the directory containing the link, not to the root of the repo.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800566
567
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_ish = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git', 'ls-tree', '--name-only', tree_ish, cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800589
590
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  If `path` is not specified, the fast lookup cache is tried first. Otherwise,
  or if the cache has no answer, 'git rev-list --first-parent' is used.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = ['git', 'rev-list', '--first-parent', '-1']
  cmd.extend(['--before', str(timestamp), branch])
  if path:
    cmd.extend(['--', path])

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  if not git_rev:
    return None
  return git_rev
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800627
628
def get_revlist_by_period(git_repo, branch, period):
  """Enumerates first-parent commits of `branch` around given time period.

  The result consists of the last commit before the period (if any), followed
  by the commits inside the period in the order printed by
  'git rev-list --reverse'.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name.
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash)
  """
  base_cmd = ['git', 'rev-list', '--first-parent', '--timestamp']
  begin, end = period[0], period[1]

  # Find the last commit before period[0].
  before_cmd = base_cmd + ['-1', '--before', str(begin - 1), branch]
  text = util.check_output(*before_cmd, cwd=git_repo)

  # Find commits in the period.
  within_cmd = base_cmd + [
      '--reverse', '--after',
      str(begin), '--before',
      str(end), branch
  ]
  text += util.check_output(*within_cmd, cwd=git_repo)

  # Each line is "<timestamp> <commit hash>".
  pairs = (line.split() for line in text.splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
661
662
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
672
673
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Files ignored by git's standard exclude rules are not listed.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or []:
    assert not os.path.isabs(exclude), 'should be relative'
    # The leading '/' anchors the pattern at the root of git_repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [line.rstrip('/') for line in output.splitlines()]
702
703
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    target = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', target)
    # A symbolic link to a directory is removed as a link; its target is
    # never followed.
    if os.path.isdir(target) and not os.path.islink(target):
      shutil.rmtree(target)
    else:
      os.unlink(target)
725
726
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name; must not look like a git hash
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if the corresponding padding flag is
    true. For padded entries, the subject is 'dummy subject' if `with_subject`
    is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  # Extra columns of padded records.
  padding_tail = ('dummy subject',) if with_subject else ()

  # Note the end padding is handled before the begin padding.
  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + padding_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + padding_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800811
812
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800813def get_history_recursively(git_repo,
814 path,
815 after,
816 before,
817 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800818 padding_end=True,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800819 branch=None):
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800820 """Get commit history of given path and its dependencies.
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800821
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800822 In comparison to get_history(), get_history_recursively also takes
823 dependencies into consideration. For example, if file A referenced file B,
824 get_history_recursively(A) will return commits of B in addition to A. This
825 applies recursively, so commits of C will be included if file B referenced
826 file C, and so on.
827
828 This function is file type neutral. `parser_callback(filename, content)` will
829 be invoked to parse file content and should return list of filename of
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800830 dependencies. If `parser_callback` returns None (usually syntax error), the
831 commit is omitted.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800832
833 Args:
834 git_repo: path of git repo
835 path: path to query, relative to git_repo
836 after: limit history after given time (inclusive)
837 before: limit history before given time (inclusive)
838 parser_callback: callback to parse file content. See above comment.
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800839 padding_end: If True, pads returned result with dummy record at exact
840 'after' time, if 'path' existed at that time.
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800841 branch: branch name or ref name
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800842
843 Returns:
844 list of (commit timestamp, git hash)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800845 """
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800846 history = get_history(
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800847 git_repo,
848 path,
849 after=after,
850 before=before,
851 padding_begin=True,
852 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800853
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800854 # Collect include information of each commit.
855 includes = {}
856 for commit_time, git_rev in history:
857 content = get_file_from_revision(git_repo, git_rev, path)
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800858 parse_result = parser_callback(path, content)
859 if parse_result is None:
860 continue
861 for include_name in parse_result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800862 if include_name not in includes:
863 includes[include_name] = set()
864 includes[include_name].add(git_rev)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800865
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800866 # Analyze the start time and end time of each include.
867 dependencies = []
868 for include in includes:
869 appeared = None
870 for commit_time, git_rev in history:
871 if git_rev in includes[include]:
872 if not appeared:
873 appeared = commit_time
874 else:
875 if appeared:
Zheng-Jie Chang4d617a42020-02-15 06:46:00 +0800876 # dependency file exists in time range [appeared, commit_time)
877 dependencies.append((include, appeared, commit_time - 1))
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800878 appeared = None
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800879
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800880 if appeared is not None:
881 dependencies.append((include, appeared, before))
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800882
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800883 # Recursion and merge.
884 result = list(history)
885 for include, appeared, disappeared in dependencies:
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800886 result += get_history_recursively(
887 git_repo,
888 include,
889 appeared,
890 disappeared,
891 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800892 padding_end=False,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800893 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800894
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800895 # Sort and padding.
896 result.sort(key=lambda x: x[0])
897 if padding_end:
898 pad = (before,)
899 pad += result[-1][1:]
900 result.append(pad)
901
902 # Dedup.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800903 result2 = []
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800904 for x in result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800905 if result2 and result2[-1] == x:
906 continue
907 result2.append(x)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800908
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800909 return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800910
911
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Lists branch ref names of a git repository.

  Runs `git branch --format=%(refname)` inside `git_repo`; the remaining
  arguments only decide which extra flags are appended to that command.

  Args:
    git_repo: path of git repo
    all_branches: if true, `-a` is passed so remote branches are listed too
    commit: if given (non-empty), only branches containing this commit are
        listed (`--contains`)
    remote: if true, `--remote` is passed to list only remote tracking
        branches

  Returns:
    list of branch ref names, one per output line, surrounding whitespace
    stripped
  """
  cmd = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    cmd.append('-a')
  if commit:
    cmd.extend(['--contains', commit])
  # Keep this flag after '-a': the command line order is part of the
  # existing behavior.
  if remote:
    cmd.append('--remote')

  output = util.check_output(*cmd, cwd=git_repo)
  return [line.strip() for line in output.splitlines()]
936
937
def list_commits_between_commits(git_repo, old, new):
  """Lists the first-parent commits in the half-open range (old, new].

  The commits are obtained from `git rev-list --reverse --first-parent`
  and then passed through `_adjust_timestamp_increasingly`; a warning is
  logged when that helper reports any adjusted timestamp.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev); empty list if `old` equals `new`
  """
  assert old and new
  if old == new:
    return []
  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  fields = (line.split() for line in output.splitlines())
  commits = [(int(timestamp), git_rev) for timestamp, git_rev in fields]

  # bisect-kit fundamentally assumes that commit timestamps are increasing,
  # because commits are sorted and bisected by timestamp across git repos.
  # When they are not, the timestamps are adjusted as a workaround. This
  # might lead to bad bisect result, however the probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(commits)
  if adjusted == 0:
    return commits

  logger.warning('Commit timestamps are not increasing')
  logger.warning('%d timestamps adjusted', adjusted)
  return commits