blob: 2994fd3da061f99d1cf8f3987be73fd16870fbed [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
logger = logging.getLogger(__name__)

GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least 7 digits.

  Args:
    s: string to check.

  Returns:
    True if `s` consists solely of lowercase hexadecimal digits and its length
    is between GIT_MIN_COMMIT_ID_LENGTH and GIT_FULL_COMMIT_ID_LENGTH
    (inclusive).
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor with \Z instead of $: '$' also matches right before a trailing
  # newline, which would wrongly accept strings like '1a2b3c4\n'.
  return bool(re.match(r'[0-9a-f]+\Z', s))
44
45
def argtype_git_rev(s):
  """Argument type checker for git hashes.

  Args:
    s: command line argument value.

  Returns:
    `s` unchanged, if it looks like a git hash.

  Raises:
    cli.ArgTypeError: `s` is not a git hash-like string.
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tells whether `path` is the root of a git checkout.

  Args:
    path: directory to examine.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory to examine.

  Returns:
    True if 'git rev-parse --is-bare-repository' run inside `path` prints
    'true'. False if `path` is not a directory or the git command failed.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` (and its parent directories) is created first if it doesn't
  exist; the clone is then performed into that directory.

  Args:
    git_repo: path of the destination git repo.
    repo_url: url of the repository to clone.
    reference: if given, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Checks out the given revision quietly and forcibly.

  This runs "git checkout -q -f".

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo):
  """Runs "git init" inside `git_repo`.

  The directory, including any missing parent directories, is created
  beforehand if it doesn't exist.

  Args:
    git_repo: path of git repo.
  """
  repo_exists = os.path.exists(git_repo)
  if not repo_exists:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
101
102
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes a file and commits it.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time`.
  """
  if author_time is None:
    author_time = commit_time

  # Only export the dates which are actually specified.
  env = {}
  for name, value in (('GIT_AUTHOR_DATE', author_time),
                      ('GIT_COMMITTER_DATE', commit_time)):
    if value:
      env[name] = str(value)

  target = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(target)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(target, 'w') as f:
    f.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
138
139
def config(git_repo, *args):
  """Runs 'git config' with the given arguments.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ['git', 'config']
  cmd.extend(args)
  util.check_call(*cmd, cwd=git_repo)
148
149
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  A failed fetch is retried only if its stderr indicates an HTTP 5xx error.
  At most 5 attempts are made, sleeping min(60, 10 * 2**attempt) seconds
  between attempts.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retryable
        error, or it still failed at the last attempt.
  """
  max_attempts = 5
  attempt = 0
  while True:
    attempt += 1
    captured = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if attempt >= max_attempts:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800181
182
Kuang-che Wued1bb622020-05-30 23:06:23 +0800183def _adjust_timestamp_increasingly(commits):
184 """Adjust commit timestamps.
185
186 After adjust, the timestamps are increasing.
187
188 Args:
189 commits: list of (timestamp, commit hash)
190
191 Returns:
192 (adjusted count, list of (timestamp, commit hash))
193 """
194 result = []
195 adjusted = 0
196 last_timestamp = -1
197 for timestamp, git_rev in commits:
198 if timestamp < last_timestamp:
199 adjusted += 1
200 timestamp = last_timestamp
201 else:
202 last_timestamp = timestamp
203 result.append((timestamp, git_rev))
204 return adjusted, result
205
206
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache has no data for a query.

  Callers are expected to catch this and fall back to the original
  (uncached) operation.
  """
212
213
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch name the commits are enumerated from.
    optimized_period: (begin, end) timestamps of the cached period; None if
        optimize() has not been called yet.
    cached: list of (timestamp, commit hash); timestamps are adjusted to be
        non-decreasing so the list can be binary searched.
    commit_to_index: dict, commit hash -> index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Caches commits of given period.

    Nothing is done if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp); begin must not exceed end.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the cached commit at given time.

    Args:
      timestamp: timestamp in query.

    Returns:
      commit hash; None if there is no cached commit at or before `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet, or `timestamp` is outside of
          the cached period.
    """
    if self.optimized_period is None:
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    # An empty cache means there are no commits up to the end of the period;
    # check this first to avoid indexing into an empty list.
    if not self.cached or timestamp < self.cached[0][0]:
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    #
    # '' sorts before any commit hash, so `idx` is the index of the first
    # entry whose timestamp >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      # No exact match; use the closest earlier commit.
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is one of the cached commits.

    Args:
      rev: full commit hash.

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed
262
Kuang-che Wued1bb622020-05-30 23:06:23 +0800263
class FastLookup:
  """Registry of FastLookupEntry objects, keyed by git repo and branch."""

  def __init__(self):
    self.target_period = None
    self.entries = {}

  def optimize(self, period):
    """Sets the time period which later lookups will cache."""
    self.target_period = period

  def disable(self):
    """Turns off fast lookup and drops everything cached."""
    self.entries = {}
    self.target_period = None

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up commit by time; raises FastLookupFailed if not applicable."""
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    per_repo = self.entries.setdefault(git_repo, {})
    if branch not in per_repo:
      per_repo[branch] = FastLookupEntry(git_repo, branch)
    entry = per_repo[branch]
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Checks cached branches of `git_repo` for `rev`.

    Raises FastLookupFailed if no cached entry knows about `rev`.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    branch_entries = self.entries.get(git_repo)
    if branch_entries is None:
      raise FastLookupFailed

    for entry in branch_entries.values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
    raise FastLookupFailed


fast_lookup = FastLookup()
306
307
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Tells whether given commit exists in the repo.

  The fast lookup cache is consulted first; if it cannot answer,
  'git cat-file -t' is used.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800332
333
@cache_util.Cache.default_disabled
def is_ancestor_commit(git_repo, old, new):
  """Tells whether `old` commit is an ancestor of `new` commit.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  rev_range = '%s..%s' % (old, new)
  ancestry = util.check_output(
      'git', 'rev-list', '--ancestry-path', '-1', rev_range, cwd=git_repo)
  return ancestry != ''
354
355
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800356def _parse_commit_object(s):
357 meta = {}
358 header, meta['message'] = s.split('\n\n', 1)
359 for line in header.splitlines():
360 m = re.match(r'^tree (\w+)', line)
361 if m:
362 meta['tree'] = m.group(1)
363 continue
364
365 m = re.match(r'^parent (\w+)', line)
366 if m:
367 meta['parent'] = line.split()[1:]
368 continue
369
370 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
371 if m:
372 meta[m.group(1)] = m.group(2)
373 meta['%s_time' % m.group(1)] = int(m.group(3))
374 continue
375 return meta
376
377
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Reads and parses the commit object of given revision.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  commit_text = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(commit_text)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800400
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800401
def get_batch_commit_metadata(git_repo, revs):
  """Gets metadata of many commits with one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict, keyed by the object name printed by git. The value is the parsed
    metadata dict (see _parse_commit_object), or None if git printed no size
    for that object.
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as query_file:
    query_file.write(query)
    query_file.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # binary=True because we need to count size in bytes later.
    shell_cmd = 'git cat-file --batch < ' + query_file.name
    data = util.check_output('sh', '-c', shell_cmd, cwd=git_repo, binary=True)

  metas = {}
  while data:
    # Each record starts with a "<name> <type> [<size>]" line.
    first_line, data = data.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type, size_text = m.groups()
    if not size_text:
      metas[object_name] = None
      continue
    assert object_type == 'commit', 'unsupported object type: %s' % object_type
    object_size = int(size_text)
    # The object content is followed by exactly one newline.
    assert data[object_size] == ord(b'\n'), repr(data[object_size])
    payload = data[:object_size]
    data = data[object_size + 1:]
    metas[object_name] = _parse_commit_object(payload.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800433
434
def get_revlist(git_repo, old, new):
  """Lists git commits between two revisions (inclusive), oldest first.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # "old^" makes the range include `old` itself.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800451
452
def get_commit_log(git_repo, rev):
  """Reads the commit message of given revision.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
466
467
def get_commit_hash(git_repo, rev):
  """Resolves a revision to its full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  commit_spec = '%s^{commit}' % rev
  try:
    output = util.check_output(
        'git', 'rev-parse', commit_spec, '--', cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)
  # Drop the trailing newlines and the echoed '--'.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800493
494
def get_commit_time(git_repo, rev, path=None):
  """Reads the commit timestamp of the latest commit reachable from `rev`.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo; if given, only commits touching this
        path are considered

  Returns:
    timestamp (int)
  """
  path_filter = ['--', path] if path else []
  cmd = ['git', 'log', '-1', '--format=%ct', rev] + path_filter
  return int(util.check_output(*cmd, cwd=git_repo))
511
512
def is_symbolic_link(git_repo, rev, path):
  """Tells whether a file is recorded as symbolic link in given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  listing = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  fields = listing.split()
  if len(fields) < 4 or fields[3] != path:
    raise ValueError('file %s is not found in repo:%s rev:%s' %
                     (path, git_repo, rev))
  # The first field is the file mode in octal.
  return stat.S_ISLNK(int(fields[0], 8))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800536
537
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in `rev`, the link is followed and the content
  of the link target is returned instead.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The target of a symbolic link is relative to the directory holding the
    # link, not to the root of the repo.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800561
562
def list_dir_from_revision(git_repo, rev, path):
  """Enumerates names inside a directory of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800584
585
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Finds the commit as of given time.

  The fast lookup cache is tried first when `path` is not specified;
  otherwise (or if the cache cannot answer) 'git rev-list' is used.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = [
      'git', 'rev-list', '--first-parent', '-1', '--before',
      str(timestamp), branch
  ]
  if path:
    cmd.extend(['--', path])

  output = util.check_output(*cmd, cwd=git_repo)
  return output.strip() or None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800622
623
def get_revlist_by_period(git_repo, branch, period):
  """Enumerates commits of `branch` for given time period.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name.
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash): the last commit before the period (if
    any), followed by commits in the period in the order git prints them
    with --reverse.
  """
  base_cmd = ['git', 'rev-list', '--timestamp']

  # Find the last commit before period[0].
  preceding_cmd = base_cmd + ['-1', '--before', str(period[0] - 1), branch]
  preceding = util.check_output(*preceding_cmd, cwd=git_repo)

  # Find commits in the period.
  within_cmd = base_cmd + [
      '--reverse', '--after',
      str(period[0]), '--before',
      str(period[1]), branch
  ]
  within = util.check_output(*within_cmd, cwd=git_repo)

  # Each line is "<timestamp> <commit hash>".
  pairs = (line.split() for line in (preceding + within).splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
654
655
def reset_hard(git_repo):
  """Runs "git reset --hard" to restore modified and deleted files.

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
665
666
def list_untracked(git_repo, excludes=None):
  """Enumerates untracked files and directories via 'git ls-files --others'.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  listing = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [entry.rstrip('/') for entry in listing.splitlines()]
695
696
def distclean(git_repo, excludes=None):
  """Brings the git checkout back to a pristine state.

  Modified and deleted files are restored by reset_hard(); untracked files
  and directories are removed.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    victim = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', victim)
    # A symlink pointing to a directory must be unlinked, not descended into.
    if os.path.isdir(victim) and not os.path.islink(victim):
      shutil.rmtree(victim)
    else:
      os.unlink(victim)
718
719
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding` is
  used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if `padding` is true. If `padding` and
    `with_subject` are both true, 'dummy subject' will be returned as padding
    history's subject.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    fields[0] = int(fields[0])
    result.append(tuple(fields))

  if not (padding_begin or padding_end):
    return result

  # Trailing fields of a padded record.
  pad_tail = ['dummy subject'] if with_subject else []

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append(tuple([before, git_rev] + pad_tail))

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, tuple([after, git_rev] + pad_tail))

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800804
805
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  commit is omitted.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, appends a record at exact 'before' time, which
        repeats the git hash of the last record. Nothing is appended if
        there is no history at all.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit.
  includes = {}
  for _commit_time, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, including_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        if not appeared:
          appeared = commit_time
      else:
        if appeared:
          # dependency file exists in time range [appeared, commit_time)
          dependencies.append((include, appeared, commit_time - 1))
          appeared = None

    # Still referenced by the last revision.
    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # There is nothing to pad if `path` has no history in the period at all.
  if padding_end and result:
    pad = (before,)
    pad += result[-1][1:]
    result.append(pad)

  # Dedup.
  result2 = []
  for x in result:
    if result2 and result2[-1] == x:
      continue
    result2.append(x)

  return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800903
904
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Lists the branch refs of a repository.

  Runs `git branch --format=%(refname)` inside `git_repo` and returns one
  entry per output line.

  Args:
    git_repo: path of git repo
    all_branches: if true, `-a` is passed to `git branch`
    commit: if truthy, `--contains <commit>` is passed so only branches
        containing this commit are listed
    remote: if true, `--remote` is passed (only remote tracking branches)

  Returns:
    list of branch names, each stripped of surrounding whitespace
  """
  args = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    args.append('-a')
  if commit:
    args.extend(['--contains', commit])
  if remote:
    # Appended after '-a' on purpose: the option order on the command line is
    # kept exactly as before.
    args.append('--remote')

  output = util.check_output(*args, cwd=git_repo)
  return [entry.strip() for entry in output.splitlines()]
929
930
def list_commits_between_commits(git_repo, old, new):
  """Lists the first-parent commits in the range (old, new].

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive); must be an ancestor of `new`
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev) in the order printed by
    `git rev-list --reverse`. Empty list if `old` equals `new`. The list is
    passed through _adjust_timestamp_increasingly() before being returned, so
    timestamps may differ from the raw commit timestamps.
  """
  assert old and new
  if old == new:
    return []
  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is expected to be "<timestamp> <rev>"; unpacking fails
  # loudly on anything else.
  raw_commits = [(int(timestamp), git_rev)
                 for timestamp, git_rev in (
                     row.split() for row in output.splitlines())]

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if adjusted == 0:
    return commits

  logger.warning('Commit timestamps are not increasing')
  logger.warning('%d timestamps adjusted', adjusted)
  return commits
972 return commits