blob: f621bdeb304fa11cbe88224b56ea07d3c6c43ecf [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
logger = logging.getLogger(__name__)

# Length of a full (unabbreviated) git commit id, in hex digits.
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
#  - 6 digits: 4.85%
#  - 7 digits: 0.32%
#  - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least 7 digits.

  Args:
    s: string to examine.

  Returns:
    True if `s` consists solely of lowercase hex digits and its length is
    between GIT_MIN_COMMIT_ID_LENGTH and GIT_FULL_COMMIT_ID_LENGTH
    (inclusive).
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # fullmatch instead of '^...$': '$' also matches just before a trailing
  # newline, which would let strings like '1a2b3c4\n' slip through.
  return bool(re.fullmatch(r'[0-9a-f]+', s))
44
45
def argtype_git_rev(s):
  """Argument type checker which accepts only git-hash-like strings.

  Args:
    s: candidate string.

  Returns:
    `s` itself, unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: if `s` is rejected by is_git_rev().
  """
  if is_git_rev(s):
    return s
  hint = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
  raise cli.ArgTypeError(hint, '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tells whether `path` is the root of a git checkout.

  Args:
    path: directory to examine.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory to examine.

  Returns:
    True if 'git rev-parse --is-bare-repository', run inside `path`, prints
    'true'. False if `path` is not a directory or the git command fails.
  """
  if not os.path.isdir(path):
    return False
  try:
    answer = util.check_output(
        'git', 'rev-parse', '--is-bare-repository', cwd=path)
  except subprocess.CalledProcessError:
    return False
  return answer == 'true\n'
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` is created first if it doesn't exist, then `repo_url` is cloned
  into it.

  Args:
    git_repo: path of the destination directory.
    repo_url: url of the repository to clone.
    reference: if given, passed to 'git clone' as '--reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Runs 'git checkout -q -f' for the given revision.

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo, initial_branch='main'):
  """git init.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
    initial_branch: branch name passed to 'git init --initial-branch'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q', '--initial-branch', initial_branch]
  util.check_call(*cmd, cwd=git_repo)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800102
103
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits that file.

  Missing parent directories of the file are created.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time` when None.
  """
  if author_time is None:
    author_time = commit_time

  # Only the dates which are actually specified are put into the environment
  # handed to 'git commit'.
  date_overrides = (
      ('GIT_AUTHOR_DATE', author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  env = {name: str(value) for name, value in date_overrides if value}

  target = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(target)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(target, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
139
140
def config(git_repo, *args):
  """Runs 'git config' inside the given repo.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ('git', 'config') + args
  util.check_call(*cmd, cwd=git_repo)
149
150
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  A failed fetch is retried only if its stderr indicates an HTTP 5xx error.
  Retries are delayed by min(60, 10 * 2**tries) seconds and the 5th failure
  is always propagated.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retryable
        error, or failed 5 times.
  """
  attempt = 0
  while True:
    attempt += 1
    captured = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if attempt >= 5:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800182
183
Kuang-che Wued1bb622020-05-30 23:06:23 +0800184def _adjust_timestamp_increasingly(commits):
185 """Adjust commit timestamps.
186
187 After adjust, the timestamps are increasing.
188
189 Args:
190 commits: list of (timestamp, commit hash)
191
192 Returns:
193 (adjusted count, list of (timestamp, commit hash))
194 """
195 result = []
196 adjusted = 0
197 last_timestamp = -1
198 for timestamp, git_rev in commits:
199 if timestamp < last_timestamp:
200 adjusted += 1
201 timestamp = last_timestamp
202 else:
203 last_timestamp = timestamp
204 result.append((timestamp, git_rev))
205 return adjusted, result
206
207
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  Callers are expected to catch this and fall back to the original (slower)
  operation.
  """
213
214
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch whose commits are cached.
    optimized_period: (begin, end) timestamps covered by the cache, or None
        if optimize() has not been called yet.
    cached: list of (timestamp, commit hash) with non-decreasing timestamps.
    commit_to_index: dict mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Loads commits of `period` into the cache.

    Nothing is done if `period` is already covered by the cached period.

    Args:
      period: (begin, end) timestamps, begin <= end.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the commit which was current at `timestamp`.

    Args:
      timestamp: timestamp in query.

    Returns:
      commit hash; None if the cache is empty or every cached commit is newer
      than `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet, or `timestamp` is outside of
          the cached period.
    """
    period = self.optimized_period
    # Without a cached period there is nothing to answer with; report it the
    # same way as an out-of-range query so callers fall back to git.
    if period is None or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    # No commit at all up to the end of the period; self.cached[0] below
    # would raise IndexError otherwise.
    if not self.cached:
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # '' sorts before any commit hash, so `idx` is the index of the first
    # cached commit whose timestamp is >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    # No exact match: step back to the last commit before `timestamp`.
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is one of the cached commits.

    Args:
      rev: commit hash in query.

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
263
Kuang-che Wued1bb622020-05-30 23:06:23 +0800264
class FastLookup:
  """Collection of FastLookupEntry, keyed by git repo and then by branch."""

  def __init__(self):
    self.entries = {}
    self.target_period = None

  def optimize(self, period):
    """Sets the period for which lookups should be served from cache."""
    self.target_period = period

  def disable(self):
    """Turns fast lookup off and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up the commit of `branch` at `timestamp` using the cache.

    The entry for (git_repo, branch) is created and filled on demand.

    Raises:
      FastLookupFailed: fast lookup is not enabled or `timestamp` is outside
          of the target period.
    """
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    per_repo = self.entries.setdefault(git_repo, {})
    if branch not in per_repo:
      per_repo[branch] = FastLookupEntry(git_repo, branch)
    entry = per_repo[branch]
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Tells whether `rev` is a cached commit of `git_repo`.

    Raises:
      FastLookupFailed: no cached entry knows about `rev`.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    if git_repo not in self.entries:
      raise FastLookupFailed

    for entry in self.entries[git_repo].values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
    raise FastLookupFailed


fast_lookup = FastLookup()
307
308
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is used when
  the cache has no answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass  # Not cached; ask git directly.

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800333
334
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  # Any output at all means there is at least one commit on the ancestry path.
  ancestry = util.check_output(
      'git', 'rev-list', '--ancestry-path', '-1', rev_range, cwd=git_repo)
  return ancestry != ''
355
356
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800357def _parse_commit_object(s):
358 meta = {}
359 header, meta['message'] = s.split('\n\n', 1)
360 for line in header.splitlines():
361 m = re.match(r'^tree (\w+)', line)
362 if m:
363 meta['tree'] = m.group(1)
364 continue
365
366 m = re.match(r'^parent (\w+)', line)
367 if m:
368 meta['parent'] = line.split()[1:]
369 continue
370
371 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
372 if m:
373 meta[m.group(1)] = m.group(2)
374 meta['%s_time' % m.group(1)] = int(m.group(3))
375 continue
376 return meta
377
378
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The commit object is dumped by 'git cat-file -p' and parsed by
  _parse_commit_object().

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  raw_commit = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(raw_commit)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800401
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800402
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of multiple commits by one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict mapping object name (as printed by git) to the metadata dict
    returned by _parse_commit_object(). The value is None for entries whose
    header line carries no size field (presumably objects git reported as
    missing -- TODO confirm).
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as query_file:
    query_file.write(query)
    query_file.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # binary=True because we need to count size in bytes later.
    shell_cmd = 'git cat-file --batch < ' + query_file.name
    remaining = util.check_output(
        'sh', '-c', shell_cmd, cwd=git_repo, binary=True)

  metas = {}
  while remaining:
    # Each record starts with a header line: "<name> <type> [<size>]".
    header, remaining = remaining.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', header.decode('utf8'))
    assert m, repr(header)
    name, kind, size_text = m.groups()
    if not size_text:
      metas[name] = None
      continue
    assert kind == 'commit', 'unsupported object type: %s' % kind
    size = int(size_text)
    # The object content is `size` bytes, followed by one newline.
    assert remaining[size] == ord(b'\n'), repr(remaining[size])
    payload = remaining[:size]
    remaining = remaining[size + 1:]
    metas[name] = _parse_commit_object(payload.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800434
435
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^' (parent of old) as the exclusive lower bound makes `old` itself
  # part of the result.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800452
453
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message, as printed by 'git log -1 --format=%B'
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
467
468
def get_commit_hash(git_repo, rev):
  """Resolves a revision to its full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Drop the trailing newlines and dashes which follow the hash in the output.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800494
495
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo. If given, 'git log' is restricted to
        this path.

  Returns:
    timestamp (int)
  """
  path_filter = ['--', path] if path else []
  output = util.check_output(
      'git', 'log', '-1', '--format=%ct', rev, *path_filter, cwd=git_repo)
  return int(output)
512
513
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5\tdefault.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  first_entry = output.split('\n', 1)[0]
  # The file name is separated from the metadata by a TAB. Splitting there
  # (instead of on any whitespace) keeps names containing spaces intact.
  metadata, tab, name = first_entry.partition('\t')
  fields = metadata.split()
  if tab and fields and name == path:
    return stat.S_ISLNK(int(fields[0], 8))

  raise ValueError('file %s is not found in repo:%s rev:%s' %
                   (path, git_repo, rev))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800537
538
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in `rev`, the link is followed and the content
  of its target is returned instead.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  # Local import: git tree paths always use '/', regardless of platform.
  import posixpath

  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The link target is relative to the directory containing the link, not
    # to the repo root, so resolve it against dirname(path).
    target = posixpath.normpath(
        posixpath.join(posixpath.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800562
563
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git',
      'ls-tree',
      '--name-only',
      tree_spec,
      cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800585
586
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  When `path` is not specified, the fast lookup cache is tried first.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass  # Not cached; ask git directly.

  cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch,
  ]
  if path:
    cmd.extend(['--', path])

  found = util.check_output(*cmd, cwd=git_repo).strip()
  return found if found else None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800623
624
def get_revlist_by_period(git_repo, branch, period):
  """Lists commits of `branch` in given period, with their timestamps.

  The result starts with the last commit before the period (if any), followed
  by the commits inside the period in the order printed by
  'git rev-list --reverse'.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin, end) timestamps

  Returns:
    list of (timestamp, commit hash)
  """
  begin = period[0]
  end = period[1]

  def rev_list(*options):
    return util.check_output(
        'git', 'rev-list', '--timestamp', *options, branch, cwd=git_repo)

  # Find the last commit before period[0].
  text = rev_list('-1', '--before', str(begin - 1))
  # Find commits in the period.
  text += rev_list('--reverse', '--after', str(begin), '--before', str(end))

  # Each line is "<timestamp> <commit hash>".
  pairs = (line.split() for line in text.splitlines())
  return [(int(stamp), commit) for stamp, commit in pairs]
655
656
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
666
667
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    # The leading '/' anchors the pattern at the root of the repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  listing = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [entry.rstrip('/') for entry in listing.splitlines()]
696
697
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    victim = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', victim)
    # Symbolic links (even those pointing to directories) are unlinked, never
    # traversed.
    is_real_dir = not os.path.islink(victim) and os.path.isdir(victim)
    if is_real_dir:
      shutil.rmtree(victim)
    else:
      os.unlink(victim)
719
720
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, plus the padded records at start and/or end time if requested.
    The subject of padded records is 'dummy subject'.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    fields[0] = int(fields[0])
    result.append(tuple(fields))

  # Extra element(s) appended to padded records so they have the same shape
  # as the real ones.
  dummy_tail = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + dummy_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + dummy_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800805
806
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  commit is omitted.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with a record at exact
        'before' time, which repeats the git hash of the last record. No
        padding is added if there is no history at all.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit.
  includes = {}
  for _commit_time, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, including_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        if not appeared:
          appeared = commit_time
      elif appeared:
        # dependency file exists in time range [appeared, commit_time)
        dependencies.append((include, appeared, commit_time - 1))
        appeared = None

    if appeared is not None:
      # Still referenced by the last commit: it lasts until `before`.
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # With no history at all there is nothing to repeat; result[-1] would raise
  # IndexError.
  if padding_end and result:
    result.append((before,) + result[-1][1:])

  # Dedup.
  deduped = []
  for record in result:
    if deduped and deduped[-1] == record:
      continue
    deduped.append(record)

  return deduped
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800904
905
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Lists the branch ref names reported by `git branch` for a repository.

  Args:
    git_repo: path of git repo
    all_branches: if True, pass `-a` to `git branch`
    commit: if set, only list branches containing this commit
    remote: if True, pass `--remote` to `git branch` (only remote tracking
        branches)

  Returns:
    list of branch names, as printed by `--format=%(refname)` with
    surrounding whitespace stripped
  """
  # Optional switches are appended in a fixed order: -a, --contains, --remote.
  options = []
  if all_branches:
    options.append('-a')
  if commit:
    options.extend(['--contains', commit])
  if remote:
    options.append('--remote')

  output = util.check_output(
      'git', 'branch', '--format=%(refname)', *options, cwd=git_repo)
  return [ref_line.strip() for ref_line in output.splitlines()]
930
931
def list_commits_between_commits(git_repo, old, new):
  """Lists the first-parent commits in the range (old, new].

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive); asserted to be an ancestor of `new`
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev) in the order printed by
    `git rev-list --reverse`. Timestamps are ints and may have been adjusted
    so that they are increasing (see the comment in the body).
  """
  assert old and new
  # An empty range needs no git invocation at all.
  if old == new:
    return []
  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_list_output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  raw_commits = [(int(timestamp), git_rev)
                 for timestamp, git_rev in (entry.split()
                                            for entry in
                                            rev_list_output.splitlines())]

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if adjusted == 0:
    return commits

  logger.warning('Commit timestamps are not increasing')
  logger.warning('%d timestamps adjusted', adjusted)
  return commits