blob: c43c4cb3c498b76ccbbe5a67d0052267aad4ec7f [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
logger = logging.getLogger(__name__)

GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
#  - 6 digits: 4.85%
#  - 7 digits: 0.32%
#  - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to check.

  Returns:
    True if `s` looks like a (possibly shortened) git commit id.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch() rather than '^...$': '$' also matches right before a
  # trailing newline, which would wrongly accept strings like 'deadbeef\n'.
  return bool(re.fullmatch(r'[0-9a-f]+', s))
44
45
def argtype_git_rev(s):
  """Validates git hash.

  Args:
    s: argument value to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: if `s` does not look like a git commit id.
  """
  if not is_git_rev(s):
    msg = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
    raise cli.ArgTypeError(msg, '1a2b3c4d5e')
  return s
52
53
def is_git_root(path):
  """Is given path root of git repo.

  Args:
    path: directory to check.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  return os.path.exists(os.path.join(path, '.git'))
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory to check.

  Returns:
    True if 'git rev-parse --is-bare-repository' run in `path` prints 'true'.
    False if `path` is not a directory or the git command fails.
  """
  if not os.path.isdir(path):
    return False
  try:
    return util.check_output(
        'git', 'rev-parse', '--is-bare-repository', cwd=path) == 'true\n'
  except subprocess.CalledProcessError:
    return False
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the destination directory; the repo is cloned into it.
    repo_url: url of the repo to clone.
    reference: if specified, it is passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  cmd = ['git', 'clone', repo_url, '.']
  if reference:
    cmd += ['--reference', reference]
  util.check_call(*cmd, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """git checkout.

  The checkout is forced ('-f') and quiet ('-q').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  util.check_call('git', 'checkout', '-q', '-f', rev, cwd=git_repo)
87
88
def init(git_repo, initial_branch='main'):
  """git init.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
    initial_branch: the default branch after git init
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  util.check_call(
      'git', 'init', '-q', '--initial-branch', initial_branch, cwd=git_repo)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800103
104
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Commit a file.

  The file is (over)written with `content`, added, and committed. Missing
  parent directories of the file are created.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp
    author_time: author timestamp; defaults to `commit_time` if not specified
  """
  if author_time is None:
    author_time = commit_time

  # Timestamps are handed to 'git commit' via git's date environment variables.
  env = {}
  if author_time:
    env['GIT_AUTHOR_DATE'] = str(author_time)
  if commit_time:
    env['GIT_COMMITTER_DATE'] = str(commit_time)

  full_path = os.path.join(git_repo, path)
  dirname = os.path.dirname(full_path)
  if not os.path.exists(dirname):
    os.makedirs(dirname)
  with open(full_path, 'w') as f:
    f.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  util.check_call(
      'git', 'commit', '-q', '-m', message, path, cwd=git_repo, env=env)
140
141
def config(git_repo, *args):
  """Wrapper of 'git config'.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  util.check_call('git', 'config', *args, cwd=git_repo)
150
151
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  Only failures whose stderr contains an HTTP 5xx error are retried, with an
  increasing delay capped at 60 seconds. At most 5 attempts are made.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: if 'git fetch' failed with a non-retryable
        error, or still failed after the last attempt.
  """
  tries = 0
  while True:
    tries += 1
    # Collect stderr of this attempt only, to decide whether to retry.
    stderr_lines = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=stderr_lines.append)
      return
    except subprocess.CalledProcessError:
      if tries >= 5:
        logger.error('git fetch failed too much times')
        raise
      stderr = ''.join(stderr_lines)
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' in stderr:
        delay = min(60, 10 * 2**tries)
        logger.warning('git fetch failed, will retry %s seconds later', delay)
        time.sleep(delay)
        continue
      raise
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800183
184
Kuang-che Wued1bb622020-05-30 23:06:23 +0800185def _adjust_timestamp_increasingly(commits):
186 """Adjust commit timestamps.
187
188 After adjust, the timestamps are increasing.
189
190 Args:
191 commits: list of (timestamp, commit hash)
192
193 Returns:
194 (adjusted count, list of (timestamp, commit hash))
195 """
196 result = []
197 adjusted = 0
198 last_timestamp = -1
199 for timestamp, git_rev in commits:
200 if timestamp < last_timestamp:
201 adjusted += 1
202 timestamp = last_timestamp
203 else:
204 last_timestamp = timestamp
205 result.append((timestamp, git_rev))
206 return adjusted, result
207
208
class FastLookupFailed(Exception):
  """No data is cached for this query.

  The caller should fallback to the original operation.
  """
214
215
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    # (begin, end) timestamps currently covered by `cached`; None if nothing
    # is cached yet.
    self.optimized_period = None
    # list of (timestamp, commit hash), timestamps adjusted to non-decreasing.
    self.cached = []
    # commit hash -> index in `cached`.
    self.commit_to_index = {}

  def optimize(self, period):
    """Caches commits of the branch for given period.

    Does nothing if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp), inclusive
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {}
    for i, (_timestamp, rev) in enumerate(self.cached):
      self.commit_to_index[rev] = i

  def get_rev_by_time(self, timestamp):
    """Looks up the commit at given time from the cache.

    Args:
      timestamp: timestamp in query

    Returns:
      git commit hash. None if there is no cached commit at or before
      `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet, or `timestamp` is outside of
          the cached period.
    """
    if self.optimized_period is None:
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed
    if not self.cached:
      # No commit is known for the whole period; avoid indexing an empty list.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # ('' sorts before any commit hash, so idx is the first entry whose
    # timestamp is >= `timestamp`.)
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Checks whether `rev` is one of the cached commits.

    Args:
      rev: git commit hash in query

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
264
Kuang-che Wued1bb622020-05-30 23:06:23 +0800265
class FastLookup:
  """Collection of FastLookupEntry"""

  def __init__(self):
    # git_repo -> branch -> FastLookupEntry
    self.entries = {}
    # (begin, end) timestamps to optimize; None means fast lookup is disabled.
    self.target_period = None

  def optimize(self, period):
    """Sets the period for which subsequent lookups are cached.

    Args:
      period: (begin timestamp, end timestamp), inclusive
    """
    self.target_period = period

  def disable(self):
    """Disables fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up the commit of `branch` at given time.

    The entry for (git_repo, branch) is created and populated on demand.

    Args:
      git_repo: path of git repo.
      timestamp: timestamp in query
      branch: branch name or ref name

    Returns:
      Result of FastLookupEntry.get_rev_by_time().

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside of
          the target period.
    """
    if not self.target_period:
      raise FastLookupFailed
    if not self.target_period[0] <= timestamp <= self.target_period[1]:
      raise FastLookupFailed

    branches = self.entries.setdefault(git_repo, {})
    if branch not in branches:
      branches[branch] = FastLookupEntry(git_repo, branch)
    entry = branches[branch]
    entry.optimize(self.target_period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Checks whether `rev` is cached by any entry of `git_repo`.

    Args:
      git_repo: path of git repo.
      rev: git commit hash in query

    Returns:
      True if `rev` is found in cached entries.

    Raises:
      FastLookupFailed: the answer cannot be determined from cached data.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    if git_repo not in self.entries:
      raise FastLookupFailed

    for entry in self.entries[git_repo].values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        pass
    raise FastLookupFailed


fast_lookup = FastLookup()
308
309
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is used as
  fallback, which accepts both commit and tag objects.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    return util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo) in ['commit\n', 'tag\n']
  except (subprocess.CalledProcessError, OSError):
    return False
Kuang-che Wue41e0062017-09-01 19:04:14 +0800334
335
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself. False as well if the git command fails.
  """
  try:
    # Non-empty output means there is at least one commit on the ancestry
    # path from `old` to `new`.
    return util.check_output(
        'git',
        'rev-list',
        '--ancestry-path',
        '-1',
        '%s..%s' % (old, new),
        cwd=git_repo) != ''
  except subprocess.CalledProcessError:
    return False
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800359
360
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800361def _parse_commit_object(s):
362 meta = {}
363 header, meta['message'] = s.split('\n\n', 1)
364 for line in header.splitlines():
Kuang-che Wueef83462021-01-15 16:11:30 +0800365 m = re.match(r'^(object|tree|type) (\w+)', line)
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800366 if m:
Kuang-che Wueef83462021-01-15 16:11:30 +0800367 meta[m.group(1)] = m.group(2)
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800368 continue
369
370 m = re.match(r'^parent (\w+)', line)
371 if m:
372 meta['parent'] = line.split()[1:]
373 continue
374
375 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
376 if m:
377 meta[m.group(1)] = m.group(2)
378 meta['%s_time' % m.group(1)] = int(m.group(3))
379 continue
380 return meta
381
382
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(data)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800405
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800406
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of multiple commits by one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: list of git revisions in query.

  Returns:
    dict, keyed by the object name printed by git. The value is the metadata
    dict parsed by _parse_commit_object(), or None if git did not report a
    size for that object.
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as f:
    f.write(query)
    f.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead. The file name is passed as positional parameter $1
    # rather than spliced into the command, so it needs no shell quoting.
    # binary=True because we need to count size in bytes later.
    data = util.check_output(
        'sh',
        '-c',
        'git cat-file --batch < "$1"',
        'sh',
        f.name,
        cwd=git_repo,
        binary=True)

  metas = {}
  while data:
    # Each record starts with a line "<name> <type> <size>"; the size is
    # absent when there is no object content following.
    first_line, data = data.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type = m.group(1, 2)
    if not m.group(3):
      metas[object_name] = None
      continue
    assert object_type in ['commit',
                           'tag'], 'unsupported object type: %s' % object_type
    object_size = int(m.group(3))
    # The content is followed by a newline which is not counted in the size.
    assert data[object_size] == ord(b'\n'), repr(data[object_size])
    obj, data = data[:object_size], data[object_size + 1:]
    metas[object_name] = _parse_commit_object(obj.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800439
440
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  Only the first-parent history is followed, listed from old to new.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^..new' makes the range include `old` itself.
  cmd = [
      'git', 'rev-list', '--first-parent', '--reverse',
      '%s^..%s' % (old, new)
  ]
  revlist = util.check_output(*cmd, cwd=git_repo).splitlines()
  return revlist
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800460
461
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  cmd = ['git', 'log', '-1', '--format=%B', rev]
  msg = util.check_output(*cmd, cwd=git_repo)
  return msg
475
476
def get_commit_hash(git_repo, rev):
  """Get git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  try:
    # Use '^{commit}' to restrict search only commits.
    # Use '--' to avoid ambiguity, like matching rev against path name.
    output = util.check_output(
        'git', 'rev-parse', '%s^{commit}' % rev, '--', cwd=git_repo)
    # Strip the echoed '--' and newlines, leaving only the hash.
    git_rev = output.rstrip('-\n')
  except subprocess.CalledProcessError as e:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev) from e
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800502
503
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo. If specified, the history is limited to
        this path.

  Returns:
    timestamp (int)
  """
  cmd = ['git', 'log', '-1', '--format=%ct', rev]
  if path:
    cmd += ['--', path]
  line = util.check_output(*cmd, cwd=git_repo)
  return int(line)
520
521
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  parts = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo).split()
  if len(parts) >= 4 and parts[3] == path:
    # The first field is the file mode in octal.
    return stat.S_ISLNK(int(parts[0], 8))

  raise ValueError('file %s is not found in repo:%s rev:%s' %
                   (path, git_repo, rev))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800545
546
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link, the link is followed and the content of the
  link target is returned.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The link target is relative to the directory containing the link, not
    # to the root of the repo.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800570
571
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  return util.check_output(
      'git',
      'ls-tree',
      '--name-only',
      '%s:%s' % (rev, path),
      cwd=git_repo,
      log_stdout=False).splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800593
594
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  if not branch:
    branch = 'HEAD'

  # The fast lookup cache holds whole-branch history only, so it cannot
  # answer path-limited queries.
  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch,
  ]
  if path:
    cmd += ['--', path]

  result = util.check_output(*cmd, cwd=git_repo).strip()
  return result or None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800631
632
def get_revlist_by_period(git_repo, branch, period):
  """Lists first-parent commits of `branch` relevant to given period.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash): the last commit before the period (if
    any), followed by the commits in the period, in the order printed by
    'git rev-list --reverse'.
  """
  # Find the last commit before period[0].
  text = util.check_output(
      'git',
      'rev-list',
      '--first-parent',
      '--timestamp',
      '-1',
      '--before',
      str(period[0] - 1),
      branch,
      cwd=git_repo)

  # Find commits in the period.
  text += util.check_output(
      'git',
      'rev-list',
      '--first-parent',
      '--timestamp',
      '--reverse',
      '--after',
      str(period[0]),
      '--before',
      str(period[1]),
      branch,
      cwd=git_repo)

  result = []
  for line in text.splitlines():
    timestamp, commit = line.split()
    result.append((int(timestamp), commit))
  return result
665
666
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  util.check_call('git', 'reset', '--hard', cwd=git_repo)
676
677
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  if excludes:
    for exclude in excludes:
      assert not os.path.isabs(exclude), 'should be relative'
      # The leading '/' anchors the pattern at the root of git_repo.
      exclude_flags += ['--exclude', '/' + re.escape(exclude)]

  result = []
  for path in util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo).splitlines():
    # Remove the trailing slash, which means directory.
    path = path.rstrip('/')
    result.append(path)
  return result
706
707
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # Check for symbolic link first, so a link to a directory is removed
    # itself instead of the directory tree it points to.
    if os.path.islink(path):
      os.unlink(path)
    elif os.path.isdir(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
729
730
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if `padding_begin` or `padding_end` is
    true. If padding is requested and `with_subject` is true, 'dummy subject'
    will be returned as padding history's subject.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H' if not with_subject else '%ct %H %s'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd += ['--after', str(after)]
  if before:
    cmd += ['--before', str(before)]
  if branch:
    assert not is_git_rev(branch)
    cmd += [branch]
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd += ['--', path]

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # array = [timestamp, git_rev, subject] or [timestamp, git_rev]
    array = line.split(' ', 2)
    array[0] = int(array[0])
    result.append(tuple(array))

  if padding_begin or padding_end:
    # Template of padded records; timestamp and hash are filled in below.
    history = [0, '']
    if with_subject:
      history.append('dummy subject')

    if padding_end:
      assert before, 'padding_end=True make no sense if before=None'
      # Pad only if `path` existed at `before`.
      if get_rev_by_time(git_repo, before, branch, path=path):
        before = int(before)
        if not result or result[-1][0] != before:
          git_rev = get_rev_by_time(git_repo, before, branch)
          assert git_rev
          history[0:2] = [before, git_rev]
          result.append(tuple(history))

    if padding_begin:
      assert after, 'padding_begin=True make no sense if after=None'
      # Pad only if `path` existed at `after`.
      if get_rev_by_time(git_repo, after, branch, path=path):
        after = int(after)
        if not result or result[0][0] != after:
          git_rev = get_rev_by_time(git_repo, after, branch)
          assert git_rev
          history[0:2] = [after, git_rev]
          result.insert(0, tuple(history))

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800815
816
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  dependencies of that commit are ignored.

  NOTE(review): there is no cycle detection here; if A references B and B
  references A within overlapping time ranges, the recursion may not
  terminate -- confirm callers never feed cyclic dependencies.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, reusing the git hash of the latest record. Nothing is
        padded if no history was found at all.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash), sorted by timestamp, with
    consecutive duplicated records removed
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit: for every dependency name,
  # remember the set of revisions of `path` which referenced it.
  includes = {}
  for _, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, revs in includes.items():
    # Compare against None explicitly so a timestamp of 0 is not mistaken
    # for "not appeared yet".
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in revs:
        if appeared is None:
          appeared = commit_time
      elif appeared is not None:
        # dependency file exists in time range [appeared, commit_time)
        dependencies.append((include, appeared, commit_time - 1))
        appeared = None

    if appeared is not None:
      # Still referenced by the last known commit; valid until `before`.
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    # Only the outermost call pads the end; nested calls must not, otherwise
    # dummy records would appear in the middle of the merged history.
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # Guard against empty history; there is no latest record to copy from.
  if padding_end and result:
    result.append((before,) + tuple(result[-1][1:]))

  # Dedup (only adjacent identical records, which is enough after sorting
  # for records sharing both timestamp and hash next to each other).
  deduped = []
  for record in result:
    if deduped and deduped[-1] == record:
      continue
    deduped.append(record)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800914
915
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Get branches of a repository.

  Runs `git branch --format=%(refname)` inside `git_repo` and returns one
  entry per output line.

  Args:
    git_repo: path of git repo
    all_branches: if True, pass `-a` to `git branch`
    commit: if set, only return branches containing this commit
    remote: if True, only remote tracking branches

  Returns:
    list of branch names (output lines with surrounding whitespace stripped)
  """
  # Optional flags are appended in a fixed order: -a, --contains, --remote.
  optional_args = []
  if all_branches:
    optional_args.append('-a')
  if commit:
    optional_args.extend(['--contains', commit])
  if remote:
    optional_args.append('--remote')

  output = util.check_output(
      'git', 'branch', '--format=%(refname)', *optional_args, cwd=git_repo)
  return [line.strip() for line in output.splitlines()]
940
941
def list_commits_between_commits(git_repo, old, new):
  """Get all commits between (old, new].

  Commits are listed by `git rev-list --first-parent` in oldest-first order.
  The timestamps are then passed through _adjust_timestamp_increasingly() and
  a warning is logged if any of them had to be adjusted.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev)
  """
  assert old and new
  if old == new:
    return []

  assert is_ancestor_commit(git_repo, old, new)

  rev_range = '%s..%s' % (old, new)
  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  fields_per_line = (line.split() for line in output.splitlines())
  commits = [(int(timestamp), git_rev) for timestamp, git_rev in fields_per_line]

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(commits)
  if adjusted != 0:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', adjusted)

  return commits