blob: 9fdf7d996259cc059ee4919511db6c9212bb95fb [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
22logger = logging.getLogger(__name__)
23
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to examine.

  Returns:
    True if `s` looks like a (possibly shortened) git commit id.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch instead of '^...$': '$' also matches right before a trailing
  # newline, which would accept strings like '1a2b3c4\n'.
  return re.fullmatch(r'[0-9a-f]+', s) is not None
44
45
def argtype_git_rev(s):
  """Argument type checker for git commit hashes.

  Args:
    s: command line argument value.

  Returns:
    `s` unchanged if it looks like a git hash.

  Raises:
    cli.ArgTypeError: `s` is not a git hash-like string.
  """
  if is_git_rev(s):
    return s
  msg = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
  raise cli.ArgTypeError(msg, '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tells whether `path` is the root directory of a git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True if `path` contains an entry named '.git'.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True only if `path` is a directory and
    'git rev-parse --is-bare-repository' run there prints 'true'. False if
    `path` is not a directory or the git command fails.
  """
  if not os.path.isdir(path):
    return False
  try:
    output = util.check_output(
        'git', 'rev-parse', '--is-bare-repository', cwd=path)
  except subprocess.CalledProcessError:
    return False
  return output == 'true\n'
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` is created first if it doesn't exist, then `repo_url` is cloned
  into it.

  Args:
    git_repo: path of the destination git repo.
    repo_url: url to clone from.
    reference: if given, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Checks out `rev` quietly and forcibly ('git checkout -q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo, initial_branch='main'):
  """git init.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
    initial_branch: the default branch after git init
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q', '--initial-branch', initial_branch]
  util.check_call(*cmd, cwd=git_repo)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800103
104
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits it.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp
    author_time: author timestamp; defaults to `commit_time` if None
  """
  if author_time is None:
    author_time = commit_time

  # Only set the date variables which are actually specified.
  env = {}
  for variable, when in (('GIT_AUTHOR_DATE', author_time),
                         ('GIT_COMMITTER_DATE', commit_time)):
    if when:
      env[variable] = str(when)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as output_file:
    output_file.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
140
141
def config(git_repo, *args):
  """Runs 'git config' inside `git_repo`.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ('git', 'config') + args
  util.check_call(*cmd, cwd=git_repo)
150
151
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  Only failures whose stderr mentions an HTTP 5xx error are retried, with
  increasing delay capped at 60 seconds. At most 5 attempts are made.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retriable
        error, or failed 5 times.
  """
  max_tries = 5
  attempt = 0
  while True:
    attempt += 1
    captured = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if attempt >= max_tries:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800183
184
Kuang-che Wued1bb622020-05-30 23:06:23 +0800185def _adjust_timestamp_increasingly(commits):
186 """Adjust commit timestamps.
187
188 After adjust, the timestamps are increasing.
189
190 Args:
191 commits: list of (timestamp, commit hash)
192
193 Returns:
194 (adjusted count, list of (timestamp, commit hash))
195 """
196 result = []
197 adjusted = 0
198 last_timestamp = -1
199 for timestamp, git_rev in commits:
200 if timestamp < last_timestamp:
201 adjusted += 1
202 timestamp = last_timestamp
203 else:
204 last_timestamp = timestamp
205 result.append((timestamp, git_rev))
206 return adjusted, result
207
208
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  Callers are expected to catch this and fall back to the original (slower)
  operation.
  """
214
215
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.
  """

  def __init__(self, git_repo, branch):
    """Creates an empty (not yet optimized) entry.

    Args:
      git_repo: path of git repo.
      branch: branch name or ref name to cache commits of.
    """
    self.git_repo = git_repo
    self.branch = branch
    # (begin, end) timestamps of the cached period; None if nothing is cached.
    self.optimized_period = None
    # List of (timestamp, commit hash), timestamps adjusted to non-decreasing.
    self.cached = []
    # Map from commit hash to its index in self.cached.
    self.commit_to_index = {}

  def optimize(self, period):
    """Caches commits of `period` unless it is already covered.

    Args:
      period: (begin timestamp, end timestamp), begin <= end.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the commit at given time from the cache.

    Args:
      timestamp: timestamp in query.

    Returns:
      commit hash; None if there are no cached commits at or before
      `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet or `timestamp` is outside of
          the cached period.
    """
    if self.optimized_period is None:
      # optimize() was never called; let the caller fall back.
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    if not self.cached:
      # The branch has no commits up to the end of the period.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # (timestamp, '') sorts before any real (timestamp, hash) entry, so idx is
    # the first entry whose timestamp >= the queried one.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is one of the cached commits.

    Args:
      rev: git commit hash.

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
264
Kuang-che Wued1bb622020-05-30 23:06:23 +0800265
class FastLookup:
  """Collection of FastLookupEntry, keyed by git repo and then branch."""

  def __init__(self):
    # entries[git_repo][branch] = FastLookupEntry
    self.entries = {}
    # (begin, end) timestamps; None means fast lookup is disabled.
    self.target_period = None

  def optimize(self, period):
    """Enables fast lookup for queries inside `period`."""
    self.target_period = period

  def disable(self):
    """Disables fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up commit of `branch` at `timestamp`.

    The entry for (git_repo, branch) is created and populated on demand.

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside of
          the target period.
    """
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    per_repo = self.entries.setdefault(git_repo, {})
    entry = per_repo.get(branch)
    if entry is None:
      entry = per_repo[branch] = FastLookupEntry(git_repo, branch)
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Tells whether `rev` is cached for any branch of `git_repo`.

    Raises:
      FastLookupFailed: `rev` is not found in any cached entry.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    per_repo = self.entries.get(git_repo)
    if per_repo is None:
      raise FastLookupFailed

    for entry in per_repo.values():
      try:
        found = entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
      return found
    raise FastLookupFailed


fast_lookup = FastLookup()
308
309
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is used if the
  cache cannot answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800334
335
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  output = util.check_output(*cmd, cwd=git_repo)
  return output != ''
356
357
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800358def _parse_commit_object(s):
359 meta = {}
360 header, meta['message'] = s.split('\n\n', 1)
361 for line in header.splitlines():
362 m = re.match(r'^tree (\w+)', line)
363 if m:
364 meta['tree'] = m.group(1)
365 continue
366
367 m = re.match(r'^parent (\w+)', line)
368 if m:
369 meta['parent'] = line.split()[1:]
370 continue
371
372 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
373 if m:
374 meta[m.group(1)] = m.group(2)
375 meta['%s_time' % m.group(1)] = int(m.group(3))
376 continue
377 return meta
378
379
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  raw_commit = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(raw_commit)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800402
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800403
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of many commits by one 'git cat-file --batch' invocation.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict, mapping from object name reported by git to the parsed metadata
    (see _parse_commit_object). The value is None for entries that git
    reports without an object size.
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as query_file:
    query_file.write(query)
    query_file.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # binary=True because we need to count size in bytes later.
    shell_cmd = 'git cat-file --batch < ' + query_file.name
    data = util.check_output('sh', '-c', shell_cmd, cwd=git_repo, binary=True)

  metas = {}
  while data:
    # Each record starts with a line of "<name> <type> <size>"; the size is
    # absent if the object is not available.
    first_line, data = data.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type, size_text = m.group(1, 2, 3)
    if not size_text:
      metas[object_name] = None
      continue
    assert object_type == 'commit', 'unsupported object type: %s' % object_type
    object_size = int(size_text)
    # The object content is followed by one newline.
    assert data[object_size] == ord(b'\n'), repr(data[object_size])
    payload = data[:object_size]
    data = data[object_size + 1:]
    metas[object_name] = _parse_commit_object(payload.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800435
436
def get_revlist(git_repo, old, new):
  """Enumerates git commit between two revisions (inclusive).

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^..new' starts from the parent of `old`, so `old` itself is included.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800453
454
def get_commit_log(git_repo, rev):
  """Get git commit log.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
468
469
def get_commit_hash(git_repo, rev):
  """Get git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  query = '%s^{commit}' % rev
  try:
    # Use '--' to avoid ambiguity, like matching rev against path name.
    output = util.check_output('git', 'rev-parse', query, '--', cwd=git_repo)
    # Strip the trailing newlines and the echoed '--'.
    git_rev = output.rstrip('-\n')
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800495
496
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo; if given, only history of `path` is
        considered.

  Returns:
    timestamp (int)
  """
  path_filter = ['--', path] if path else []
  output = util.check_output(
      'git', 'log', '-1', '--format=%ct', rev, *path_filter, cwd=git_repo)
  return int(output)
513
514
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  fields = output.split()
  if len(fields) < 4 or fields[3] != path:
    raise ValueError('file %s is not found in repo:%s rev:%s' %
                     (path, git_repo, rev))
  # The first field is the file mode in octal.
  mode = int(fields[0], 8)
  return stat.S_ISLNK(mode)
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800538
539
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in `rev`, the link is followed and the content
  of its target (in the same revision) is returned.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The link target is relative to the directory of the link itself, not
    # to the repo root. Normalize it so things like './' and '..' are
    # resolved before querying git again.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800563
564
def list_dir_from_revision(git_repo, rev, path):
  """Lists entries of directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  output = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_stdout=False)
  return output.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800586
587
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  # The fast lookup cache knows nothing about paths, so it is only usable for
  # whole-repo queries.
  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = ['git', 'rev-list', '--first-parent', '-1', '--before']
  cmd.append(str(timestamp))
  cmd.append(branch)
  if path:
    cmd.extend(['--', path])

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  if not git_rev:
    return None
  return git_rev
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800624
625
def get_revlist_by_period(git_repo, branch, period):
  """Lists commits of `branch` for given period.

  Besides commits inside the period, the last commit before the period (if
  any) is included as the first item.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name.
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash), in the order printed by git (the commit
    before the period first, then commits in the period with --reverse).
  """
  begin, end = str(period[0]), str(period[1])

  # Find the last commit before period[0].
  before_cmd = [
      'git', 'rev-list', '--timestamp', '-1', '--before',
      str(period[0] - 1), branch
  ]
  text = util.check_output(*before_cmd, cwd=git_repo)

  # Find commits in the period.
  within_cmd = [
      'git', 'rev-list', '--timestamp', '--reverse', '--after', begin,
      '--before', end, branch
  ]
  text += util.check_output(*within_cmd, cwd=git_repo)

  pairs = (line.split() for line in text.splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
656
657
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
667
668
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    # The leading '/' anchors the pattern at git_repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [line.rstrip('/') for line in output.splitlines()]
697
698
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # A symbolic link to directory is removed as a link, not followed.
    if os.path.isdir(path) and not os.path.islink(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
720
721
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`.
  `padding_begin` and `padding_end` are used to control what to return for
  such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if padding is requested. For padded
    records with `with_subject` true, 'dummy subject' will be returned as
    their subject.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    fields[0] = int(fields[0])
    result.append(tuple(fields))

  if not (padding_begin or padding_end):
    return result

  # Extra fields appended to padded records.
  subject_tail = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + subject_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + subject_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800806
807
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  commit is omitted.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with a record at exact
        'before' time, repeating the last known state. No record is padded
        if there is no history at all.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit.
  includes = {}
  for _commit_time, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, including_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        if not appeared:
          appeared = commit_time
      else:
        if appeared:
          # dependency file exists in time range [appeared, commit_time)
          dependencies.append((include, appeared, commit_time - 1))
        appeared = None

    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # Nothing to pad if `path` has no history in the period; result[-1] would
  # raise IndexError.
  if padding_end and result:
    pad = (before,)
    pad += result[-1][1:]
    result.append(pad)

  # Dedup.
  result2 = []
  for x in result:
    if result2 and result2[-1] == x:
      continue
    result2.append(x)

  return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800905
906
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Lists the branches of a git repository.

  Runs `git branch --format=%(refname)` inside `git_repo` and returns one
  entry per output line.

  Args:
    git_repo: path of git repo
    all_branches: if True, pass `-a` so remote branches are listed as well
    commit: if not None (and not empty), only list branches containing this
        commit
    remote: if True, pass `--remote` to list only remote tracking branches

  Returns:
    list of branch names, one per line of git output, with surrounding
    whitespace stripped
  """
  # Option order is kept stable on purpose: `--remote` comes last so that it
  # is the final listing-mode option git sees when `-a` is also present.
  git_args = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    git_args.append('-a')
  if commit:
    git_args.extend(['--contains', commit])
  if remote:
    git_args.append('--remote')

  output = util.check_output(*git_args, cwd=git_repo)
  return [name.strip() for name in output.splitlines()]
931
932
def list_commits_between_commits(git_repo, old, new):
  """Lists the first-parent commits in the range (old, new].

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive); must be an ancestor of `new`
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev), oldest first. Empty list if `old` equals `new`.
    Timestamps may have been adjusted to be increasing (a warning is logged
    when that happens).
  """
  assert old and new
  if old == new:
    # Empty range; nothing to ask git about.
    return []
  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  fields = (entry.split() for entry in output.splitlines())
  raw_commits = [(int(timestamp), git_rev) for timestamp, git_rev in fields]

  # bisect-kit sorts and bisects commits by timestamp across git repos, which
  # fundamentally assumes commit timestamps are increasing. When they are not,
  # the timestamps are adjusted as a workaround. This might lead to a bad
  # bisect result, but the probability is low in practice since most
  # machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if adjusted == 0:
    return commits

  logger.warning('Commit timestamps are not increasing')
  logger.warning('%d timestamps adjusted', adjusted)
  return commits