blob: 588cd5ad32b963bbf5504295f4d75e193f82402b [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
22logger = logging.getLogger(__name__)
23
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least 7 digits.

  Args:
    s: string to check.

  Returns:
    True if `s` consists solely of lowercase hex digits and its length is
    between GIT_MIN_COMMIT_ID_LENGTH and GIT_FULL_COMMIT_ID_LENGTH.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch instead of '^...$' with match: '$' also matches right before
  # a trailing newline, which would let strings like '1a2b3c4\n' slip through.
  return bool(re.fullmatch(r'[0-9a-f]+', s))
44
45
def argtype_git_rev(s):
  """Argument type checker which only accepts git hashes.

  Args:
    s: string to validate.

  Returns:
    `s` unchanged, if it looks like a git hash (see is_git_rev).

  Raises:
    cli.ArgTypeError: `s` is not a git hash.
  """
  if is_git_rev(s):
    return s
  # The second argument is presumably an example of a valid value -- confirm
  # against cli.ArgTypeError.
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tells whether `path` is the root of a git repo.

  Args:
    path: directory to check.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Tells whether `path` is inside .git folder or a bare git checkout.

  Args:
    path: directory to check.

  Returns:
    True if 'git rev-parse --is-bare-repository' run inside `path` answers
    'true'. False if `path` is not a directory or the git command failed.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the local directory to clone into.
    repo_url: url of the repo to clone.
    reference: if specified, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_args = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_args, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Checks out the given revision quietly and forcibly ('-q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo, initial_branch='main'):
  """Creates a git repo by 'git init'.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
    initial_branch: name of the default branch after git init
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q', '--initial-branch', initial_branch]
  util.check_call(*cmd, cwd=git_repo)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800103
104
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits it.

  Parent directories of the file are created if they don't exist.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp
    author_time: author timestamp; defaults to `commit_time` if None
  """
  effective_author_time = commit_time if author_time is None else author_time

  # Only set the date variables for timestamps which are given.
  dates = (
      ('GIT_AUTHOR_DATE', effective_author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  env = {name: str(value) for name, value in dates if value}

  target = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(target)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(target, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
140
141
def config(git_repo, *args):
  """Runs 'git config' inside the given repo.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ('git', 'config') + args
  util.check_call(*cmd, cwd=git_repo)
150
151
def fetch(git_repo, *args):
  """Runs 'git fetch', retrying on server side (HTTP 5xx) errors.

  At most 5 attempts are made. Failures other than 5xx errors are not retried.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: git fetch failed and is not retried.
  """
  max_tries = 5
  # The loop always ends by `return` or `raise`; it never falls through.
  for tries in range(1, max_tries + 1):
    stderr_lines = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=stderr_lines.append)
    except subprocess.CalledProcessError:
      if tries >= max_tries:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(stderr_lines):
        raise
      # Exponential backoff, capped at one minute.
      delay = min(60, 10 * 2**tries)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800183
184
Kuang-che Wued1bb622020-05-30 23:06:23 +0800185def _adjust_timestamp_increasingly(commits):
186 """Adjust commit timestamps.
187
188 After adjust, the timestamps are increasing.
189
190 Args:
191 commits: list of (timestamp, commit hash)
192
193 Returns:
194 (adjusted count, list of (timestamp, commit hash))
195 """
196 result = []
197 adjusted = 0
198 last_timestamp = -1
199 for timestamp, git_rev in commits:
200 if timestamp < last_timestamp:
201 adjusted += 1
202 timestamp = last_timestamp
203 else:
204 last_timestamp = timestamp
205 result.append((timestamp, git_rev))
206 return adjusted, result
207
208
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  Callers are expected to catch this and fall back to the original (slower)
  operation.
  """
214
215
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch name.
    optimized_period: (begin, end) timestamps covered by the cache, or None if
        optimize() is not called yet.
    cached: list of (timestamp, commit hash), timestamps are non-decreasing.
    commit_to_index: dict, mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Loads commits of the given period into the cache.

    Nothing is done if `period` is already covered by the cached period.

    Args:
      period: (begin, end) timestamps, inclusive.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the commit of given time from the cache.

    Args:
      timestamp: timestamp

    Returns:
      git commit hash. None if there are no commits at or before `timestamp`.

    Raises:
      FastLookupFailed: `timestamp` is not covered by the cache.
    """
    period = self.optimized_period
    # `period` is None if optimize() is never called; nothing is cached then.
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed
    if not self.cached:
      # The branch has no commits at or before the end of the cached period.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    #
    # Real commit hashes are non-empty, so (timestamp, '') sorts before all
    # cached entries with the same timestamp and `idx` points to the first
    # entry whose timestamp >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      # No exact match; take the closest earlier commit.
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is one of the cached commits.

    Args:
      rev: git commit hash.

    Returns:
      True if `rev` is cached. This method never returns False because a
      commit absent from the cache may still exist in the repo.

    Raises:
      FastLookupFailed: `rev` is not cached.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
264
Kuang-che Wued1bb622020-05-30 23:06:23 +0800265
class FastLookup:
  """Collection of FastLookupEntry, indexed by git repo and branch.

  Attributes:
    entries: dict of dict; entries[git_repo][branch] is a FastLookupEntry.
    target_period: (begin, end) timestamps to cache, or None if the fast
        lookup is not enabled.
  """

  def __init__(self):
    self.entries = {}
    self.target_period = None

  def optimize(self, period):
    """Enables fast lookup for queries within `period`.

    Commits are loaded lazily by get_rev_by_time().
    """
    self.target_period = period

  def disable(self):
    """Disables fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up the commit of `branch` at given time.

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside of
          the target period.
    """
    period = self.target_period
    if not period:
      raise FastLookupFailed
    if not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    branches = self.entries.setdefault(git_repo, {})
    entry = branches.get(branch)
    if entry is None:
      entry = branches[branch] = FastLookupEntry(git_repo, branch)
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Tells whether `rev` is cached by any branch of `git_repo`.

    Raises:
      FastLookupFailed: `rev` is not found in the cache.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    branches = self.entries.get(git_repo)
    if branches is None:
      raise FastLookupFailed

    for entry in branches.values():
      try:
        found = entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
      return found
    raise FastLookupFailed


fast_lookup = FastLookup()
308
309
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file' is used if the
  cache cannot answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass  # Not cached; ask git instead.

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    # Both a failing git command and a failure to launch it count as
    # "not found".
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800334
335
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself. False if the git command failed.
  """
  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    return False
  # Any output means there is a commit on the ancestry path.
  return output != ''
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800359
360
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800361def _parse_commit_object(s):
362 meta = {}
363 header, meta['message'] = s.split('\n\n', 1)
364 for line in header.splitlines():
365 m = re.match(r'^tree (\w+)', line)
366 if m:
367 meta['tree'] = m.group(1)
368 continue
369
370 m = re.match(r'^parent (\w+)', line)
371 if m:
372 meta['parent'] = line.split()[1:]
373 continue
374
375 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
376 if m:
377 meta[m.group(1)] = m.group(2)
378 meta['%s_time' % m.group(1)] = int(m.group(3))
379 continue
380 return meta
381
382
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  commit_text = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(commit_text)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800405
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800406
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of multiple commits by one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict, keyed by the object name reported by git. For a found commit, the
    value is the dict of metadata (see _parse_commit_object). If git reports
    the object without a size (e.g. "<name> missing"), the value is None.
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as f:
    f.write(query)
    f.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # binary=True because we need to count size in bytes later.
    data = util.check_output(
        'sh',
        '-c',
        'git cat-file --batch < ' + f.name,
        cwd=git_repo,
        binary=True)

  metas = {}
  while data:
    # Each record starts with "<name> <type> <size>" or "<name> missing".
    first_line, data = data.split(b'\n', 1)
    # For a missing object, git echoes the name as given in the query, which
    # may contain non-word characters (e.g. "refs/heads/foo" or "HEAD~1"), so
    # match the name with \S+ instead of \w+.
    m = re.match(r'^(\S+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type = m.group(1, 2)
    if not m.group(3):
      metas[object_name] = None
      continue
    assert object_type == 'commit', 'unsupported object type: %s' % object_type
    object_size = int(m.group(3))
    # The content is followed by one newline, which is not counted in size.
    assert data[object_size] == ord(b'\n'), repr(data[object_size])
    obj, data = data[:object_size], data[object_size + 1:]
    metas[object_name] = _parse_commit_object(obj.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800438
439
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  Only the first-parent chain is followed.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions, oldest first. The list contains the input
    revisions, old and new.
  """
  assert old
  assert new
  # "old^" makes the range start from the parent of `old`, so `old` itself is
  # included in the result.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--first-parent', '--reverse', rev_range,
      cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800459
460
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message, i.e. output of 'git log -1 --format=%B'
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
474
475
def get_commit_hash(git_repo, rev):
  """Get git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  rev_spec = '%s^{commit}' % rev
  try:
    # Use '--' to avoid ambiguity, like matching rev against path name.
    output = util.check_output(
        'git', 'rev-parse', rev_spec, '--', cwd=git_repo)
  except subprocess.CalledProcessError as e:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev) from e

  # Drop trailing dashes and newlines after the hash.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800501
502
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo; if specified, it is passed to 'git log'
        to limit the history.

  Returns:
    timestamp (int)
  """
  path_args = ['--', path] if path else []
  output = util.check_output(
      'git', 'log', '-1', '--format=%ct', rev, *path_args, cwd=git_repo)
  return int(output)
519
520
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5<TAB>default.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  for line in output.splitlines():
    # The file name is separated from the metadata by a TAB. Split there
    # instead of on any whitespace, so names containing spaces still match.
    entry_meta, tab, name = line.partition('\t')
    if tab and name == path:
      # The first field is the file mode in octal.
      return stat.S_ISLNK(int(entry_meta.split()[0], 8))

  raise ValueError('file %s is not found in repo:%s rev:%s' %
                   (path, git_repo, rev))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800544
545
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link, the link is followed and the content of its
  target is returned instead.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # For a symbolic link, `result` is the link target, which is relative to
    # the directory containing the link, not to the root of the repo.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800569
570
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  output = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_stdout=False)
  return output.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800592
593
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  # The fast lookup cache is not used for path-limited queries.
  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass  # Not cached; ask git instead.

  cmd = [
      'git', 'rev-list', '--first-parent', '-1', '--before',
      str(timestamp), branch
  ]
  if path:
    cmd.extend(['--', path])

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  if not git_rev:
    return None
  return git_rev
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800630
631
def get_revlist_by_period(git_repo, branch, period):
  """Enumerates commits of `branch` in given period.

  Only the first-parent chain is followed. In addition to commits in the
  period, the last commit before the period (if any) is included as the first
  item.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin, end) timestamps

  Returns:
    list of (timestamp, commit hash), as printed by
    'git rev-list --timestamp'.
  """
  begin, end = period[0], period[1]
  base_cmd = ['git', 'rev-list', '--first-parent', '--timestamp']

  # Find the last commit before period[0].
  last_before = util.check_output(
      *base_cmd, '-1', '--before', str(begin - 1), branch, cwd=git_repo)

  # Find commits in the period.
  within = util.check_output(
      *base_cmd,
      '--reverse',
      '--after',
      str(begin),
      '--before',
      str(end),
      branch,
      cwd=git_repo)

  # Each output line is "<timestamp> <commit hash>".
  pairs = (line.split() for line in (last_before + within).splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
664
665
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
675
676
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    # The leading '/' anchors the pattern at git_repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [entry.rstrip('/') for entry in output.splitlines()]
705
706
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # A symbolic link is removed itself even if it points to a directory;
    # only real directories are deleted recursively.
    if os.path.isdir(path) and not os.path.islink(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
728
729
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if padding is requested. If padding and
    `with_subject` are both true, 'dummy subject' will be returned as padding
    history's subject.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  output = util.check_output(*cmd, cwd=git_repo)
  for line in output.splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  # Extra fields of padded records, after (timestamp, git hash).
  padding_tail = ('dummy subject',) if with_subject else ()

  # Note the end is padded before the begin; the begin padding checks the
  # first record of the possibly already padded result.
  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + padding_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + padding_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800814
815
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), that
  commit contributes no dependencies; the commit itself still appears in the
  returned history.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with a dummy record at exact
        'before' time, reusing the git hash of the latest record. Nothing is
        padded if there is no history at all.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash), sorted by timestamp, with adjacent
    duplicated records removed.
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit: map every dependency name to
  # the set of revisions of `path` which reference it.
  includes = {}
  for _, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, referencing_revs in includes.items():
    # Compare against None explicitly so that a timestamp of 0 still counts
    # as "appeared".
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in referencing_revs:
        if appeared is None:
          appeared = commit_time
      elif appeared is not None:
        # dependency file exists in time range [appeared, commit_time)
        dependencies.append((include, appeared, commit_time - 1))
        appeared = None

    # Still referenced by the last commit; the dependency lasts until 'before'.
    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    # Nested calls must not pad; padding is done once at the top level below.
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # Guard against empty history; there is no latest record to copy from.
  if padding_end and result:
    pad = (before,)
    pad += result[-1][1:]
    result.append(pad)

  # Dedup. Only adjacent identical records are dropped.
  deduped = []
  for record in result:
    if deduped and deduped[-1] == record:
      continue
    deduped.append(record)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800913
914
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """List branch ref names of a repository via `git branch`.

  Args:
    git_repo: path of git repo; used as working directory of the git command
    all_branches: pass `-a` to git (also list remote branches) if True
    commit: if set, only list branches containing this commit
    remote: pass `--remote` to git (only remote tracking branches) if True

  Returns:
    list of ref names (formatted as `%(refname)`), one per output line with
    surrounding whitespace stripped
  """
  git_args = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    git_args.append('-a')
  if commit:
    git_args.extend(['--contains', commit])
  if remote:
    git_args.append('--remote')

  output = util.check_output(*git_args, cwd=git_repo)
  return [ref.strip() for ref in output.splitlines()]
939
940
def list_commits_between_commits(git_repo, old, new):
  """Lists first-parent commits in the range (old, new], oldest first.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive); must be an ancestor of `new`
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev), where timestamp is an int. Empty list if `old`
    and `new` are the same. Timestamps may have been adjusted to be
    increasing; a warning is logged in that case.
  """
  assert old and new
  if old == new:
    return []

  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  raw_commits = [(int(timestamp), git_rev)
                 for timestamp, git_rev in (row.split()
                                            for row in output.splitlines())]

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if adjusted != 0:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', adjusted)

  return commits