blob: 982e91431da28331d1df9140803d848dad42e52d [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
23
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to check.

  Returns:
    True if `s` consists solely of lowercase hex digits and its length is in
    the accepted range; False otherwise.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch instead of '^...$': '$' also matches just before a trailing
  # newline, which would let strings like '1a2b3c4\n' pass.
  return re.fullmatch(r'[0-9a-f]+', s) is not None
44
45
def argtype_git_rev(s):
  """Argument type checker for git hashes.

  Args:
    s: string to validate.

  Returns:
    `s` unchanged, if is_git_rev() accepts it.

  Raises:
    cli.ArgTypeError: `s` does not look like a git hash.
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tell whether `path` has a '.git' entry directly inside it.

  Args:
    path: directory path to check.

  Returns:
    True if '<path>/.git' exists (file or directory), False otherwise.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Is inside .git folder or bare git checkout.

  Args:
    path: directory path to check.

  Returns:
    True only if `path` is a directory and
    'git rev-parse --is-bare-repository' run there prints 'true'. False if
    `path` is not a directory or the git command fails.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the directory to clone into.
    repo_url: url of the repository to clone.
    reference: if given, passed to 'git clone' as the '--reference' value.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_args = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_args, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Check out `rev` quietly and forcibly ('git checkout -q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo):
  """Run 'git init -q' in `git_repo`.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
  """
  repo_exists = os.path.exists(git_repo)
  if not repo_exists:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
101
102
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Write a file into the repo and commit it.

  Missing parent directories of the file are created. The file is written
  (overwriting any existing content), staged with 'git add' and committed
  with 'git commit'.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if truthy
    author_time: author timestamp; defaults to commit_time when None, and is
        exported as GIT_AUTHOR_DATE if truthy
  """
  if author_time is None:
    author_time = commit_time

  # Only timestamps that were actually provided are handed to git.
  env = {}
  dates = (
      ('GIT_AUTHOR_DATE', author_time),
      ('GIT_COMMITTER_DATE', commit_time),
  )
  for variable, value in dates:
    if value:
      env[variable] = str(value)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
138
139
def config(git_repo, *args):
  """Run 'git config' inside `git_repo`.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ['git', 'config']
  cmd.extend(args)
  util.check_call(*cmd, cwd=git_repo)
148
149
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  The fetch is attempted up to 5 times. A failed attempt is retried only if
  git's stderr contains 'The requested URL returned error: 5' (i.e. a 5xx
  response); any other failure is re-raised immediately. The delay between
  attempts doubles each time and is capped at 60 seconds.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retryable
        error, or it still failed on the last attempt.
  """
  attempt = 0
  while True:
    attempt += 1
    captured_stderr = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured_stderr.append)
    except subprocess.CalledProcessError:
      if attempt >= 5:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured_stderr):
        raise
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800181
182
def _adjust_timestamp_increasingly(commits):
  """Make commit timestamps non-decreasing.

  Any timestamp smaller than the largest timestamp seen so far is raised to
  that largest value; the order of commits is kept.

  Args:
    commits: list of (timestamp, commit hash)

  Returns:
    (adjusted count, list of (timestamp, commit hash))
  """
  adjusted_count = 0
  monotonic = []
  high_water = -1
  for stamp, rev in commits:
    if stamp >= high_water:
      high_water = stamp
    else:
      adjusted_count += 1
      stamp = high_water
    monotonic.append((stamp, rev))
  return adjusted_count, monotonic
205
206
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  Callers are expected to catch this and fall back to the original (slower)
  operation.
  """
212
213
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch (or ref) whose commits are cached.
    optimized_period: (begin, end) timestamps covered by the cache, or None
        before optimize() has been called.
    cached: list of (timestamp, commit hash); after optimize() the
        timestamps are adjusted to be non-decreasing.
    commit_to_index: dict mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Load commits of `period` into the cache.

    Nothing is done if the currently cached period already covers `period`.

    Args:
      period: (begin timestamp, end timestamp), begin <= end.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Look up the cached commit at the given time.

    Args:
      timestamp: timestamp in query.

    Returns:
      commit hash, or None if no cached commit is at or before `timestamp`.

    Raises:
      FastLookupFailed: optimize() was not called yet, or `timestamp` is
          outside of the optimized period.
    """
    if not self.optimized_period:
      # Nothing cached yet; let the caller fall back instead of crashing on
      # indexing None.
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    if not self.cached:
      # The period is cached but holds no commit at all; indexing cached[0]
      # below would raise IndexError.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    #
    # '' sorts before any commit hash, so this finds the first entry whose
    # timestamp is >= `timestamp`.
    idx = bisect.bisect_right(self.cached, (timestamp, ''))
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      # No exact match at idx; step back to the last entry before `timestamp`.
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tell whether `rev` is cached.

    Returns:
      True if `rev` is in the cache.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed

  def is_ancestor_commit(self, old, new):
    """Compare positions of two cached commits.

    Returns:
      True if `old` is positioned before `new` in the cache.

    Raises:
      FastLookupFailed: at least one of the commits is not cached.
    """
    old_idx = self.commit_to_index.get(old)
    new_idx = self.commit_to_index.get(new)
    if old_idx is not None and new_idx is not None:
      return old_idx < new_idx
    raise FastLookupFailed
269
270
class FastLookup:
  """Collection of FastLookupEntry

  Entries are stored in `entries`, a dict keyed by git repo path and then by
  branch. `target_period` is the period newly created or reused entries are
  optimized for; None means fast lookup is disabled.
  """

  def __init__(self):
    self.entries = {}
    self.target_period = None

  def optimize(self, period):
    """Set the period that later get_rev_by_time() calls are cached for."""
    self.target_period = period

  def disable(self):
    """Turn fast lookup off and drop all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Look up the commit of `branch` at `timestamp` via the cache.

    The entry for (git_repo, branch) is created on first use and optimized
    for the target period.

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside of
          the target period.
    """
    period = self.target_period
    if not period:
      raise FastLookupFailed
    if not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    branches = self.entries.setdefault(git_repo, {})
    if branch not in branches:
      branches[branch] = FastLookupEntry(git_repo, branch)
    entry = branches[branch]
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def _first_answer(self, git_repo, query):
    """Return the result of the first entry of `git_repo` that can answer.

    Args:
      git_repo: path of git repo.
      query: callable taking a FastLookupEntry; it either returns the answer
          or raises FastLookupFailed.

    Raises:
      FastLookupFailed: no entry exists for `git_repo`, or none can answer.
    """
    if git_repo not in self.entries:
      raise FastLookupFailed

    for entry in self.entries[git_repo].values():
      try:
        return query(entry)
      except FastLookupFailed:
        continue
    raise FastLookupFailed

  def is_containing_commit(self, git_repo, rev):
    """Ask cached entries of `git_repo` whether `rev` exists."""
    # This function is optimized only after get_rev_by_time() is invoked.
    return self._first_answer(
        git_repo, lambda entry: entry.is_containing_commit(rev))

  def is_ancestor_commit(self, git_repo, old, new):
    """Ask cached entries of `git_repo` whether `old` precedes `new`."""
    # This function is optimized only after get_rev_by_time() is invoked.
    return self._first_answer(
        git_repo, lambda entry: entry.is_ancestor_commit(old, new))
322
323
# Module-level FastLookup instance consulted by is_containing_commit(),
# is_ancestor_commit() and get_rev_by_time() before they run git.
fast_lookup = FastLookup()
325
326
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is used when
  the cache cannot answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass  # Not cached; ask git instead.

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800350
351
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  The fast lookup cache is consulted first; 'git rev-list --ancestry-path'
  is used when the cache cannot answer.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  try:
    return fast_lookup.is_ancestor_commit(git_repo, old, new)
  except FastLookupFailed:
    pass  # Not cached; ask git instead.

  commit_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--ancestry-path', '-1', commit_range, cwd=git_repo)
  return output != ''
376
377
def _parse_commit_object(s):
  """Parse the text of a git commit object.

  The text consists of header lines, one blank line, and the commit message.

  Args:
    s: content of the commit object (str), as printed by 'git cat-file'.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits, in the order they appear; absent if
          the commit has no parent line.
      author: name and email of author
      author_time: author timestamp (int)
      committer: name and email of committer
      committer_time: commit timestamp (int)
      message: commit message text; empty string if there is no blank line
          separating header and message.
  """
  # partition() tolerates an object without the blank separator line.
  header, _separator, message = s.partition('\n\n')
  meta = {'message': message}
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # A merge commit has one 'parent' line per parent; accumulate them
      # instead of letting the last line overwrite the earlier ones.
      meta.setdefault('parent', []).extend(line.split()[1:])
      continue

    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
398
399
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Read and parse the commit object of `rev`.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  raw_commit = util.check_output(*cmd, cwd=git_repo, log_stdout=False)
  return _parse_commit_object(raw_commit)
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800422
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800423
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of multiple commits with one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: list of git commit revisions in query.

  Returns:
    dict keyed by the object name printed on each header line of the batch
    output. The value is the metadata dict produced by
    _parse_commit_object(), or None if git printed no size for that object
    (e.g. it is reported as missing).
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as f:
    f.write(query)
    f.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # The file name is passed as positional parameter "$1" rather than
    # pasted into the script, so a temp path with spaces or shell
    # metacharacters is still handled correctly.
    # binary=True because we need to count size in bytes later.
    data = util.check_output(
        'sh',
        '-c',
        'git cat-file --batch < "$1"',
        'sh',
        f.name,
        cwd=git_repo,
        binary=True)

  metas = {}
  # Walk the output by offset instead of re-slicing the remaining buffer for
  # every object, which would copy the tail over and over.
  pos = 0
  end = len(data)
  while pos < end:
    eol = data.index(b'\n', pos)
    first_line = data[pos:eol]
    pos = eol + 1
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type = m.group(1, 2)
    if not m.group(3):
      # Header without size: no content follows for this object.
      metas[object_name] = None
      continue
    assert object_type == 'commit', 'unsupported object type: %s' % object_type
    object_size = int(m.group(3))
    obj_end = pos + object_size
    # The object content is followed by exactly one newline.
    terminator = data[obj_end:obj_end + 1]
    assert terminator == b'\n', repr(terminator)
    metas[object_name] = _parse_commit_object(
        data[pos:obj_end].decode('utf8'))
    pos = obj_end + 1
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800455
456
def get_revlist(git_repo, old, new):
  """List git commits between two revisions (inclusive), oldest first.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^..new' starts from the parent of `old`, so `old` itself is listed.
  commit_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', commit_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800473
474
def get_commit_log(git_repo, rev):
  """Get the commit message of `rev` ('git log -1 --format=%B').

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
488
489
def get_commit_hash(git_repo, rev):
  """Resolve `rev` to a full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # Drop the trailing newlines and dashes from the output.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800515
516
def get_commit_time(git_repo, rev, path=None):
  """Get git commit timestamp ('git log -1 --format=%ct').

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo; if given, the log is limited to it

  Returns:
    timestamp (int)
  """
  cmd = ['git', 'log', '-1', '--format=%ct', rev]
  if path:
    cmd.extend(['--', path])
  return int(util.check_output(*cmd, cwd=git_repo))
533
534
def is_symbolic_link(git_repo, rev, path):
  """Check whether `path` is recorded as a symbolic link in `rev`.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  fields = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo).split()
  if len(fields) < 4 or fields[3] != path:
    raise ValueError('file %s is not found in repo:%s rev:%s' %
                     (path, git_repo, rev))

  # The first field is the file mode in octal.
  mode = int(fields[0], 8)
  return stat.S_ISLNK(mode)
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800558
559
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in `rev`, the link is followed and the content
  of its target is returned instead.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # For a symlink, 'git show' prints the link target. The target is
    # relative to the directory containing the link, not to the repo root,
    # so resolve it against dirname(path) before looking it up.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800583
584
def list_dir_from_revision(git_repo, rev, path):
  """List entry names of a directory at given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_ish = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git',
      'ls-tree',
      '--name-only',
      tree_ish,
      cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800606
607
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  When no `path` is given, the fast lookup cache is consulted first;
  'git rev-list --first-parent' is used otherwise or when the cache cannot
  answer.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass  # Not cached; ask git instead.

  cmd = [
      'git',
      'rev-list',
      '--first-parent',
      '-1',
      '--before',
      str(timestamp),
      branch,
  ]
  if path:
    cmd.extend(['--', path])

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  if not git_rev:
    return None
  return git_rev
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800644
645
def get_revlist_by_period(git_repo, branch, period):
  """List commits of `branch` inside `period`, plus the one just before it.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name.
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash), parsed from 'git rev-list --timestamp'.
    The first element, if any, is the last commit before period[0]; it is
    followed by the commits in the period, listed with '--reverse'.
  """
  begin, end = period[0], period[1]

  # Find the last commit before period[0].
  preceding = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '-1',
      '--before',
      str(begin - 1),
      branch,
      cwd=git_repo)

  # Find commits in the period.
  within = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--after',
      str(begin),
      '--before',
      str(end),
      branch,
      cwd=git_repo)

  # Each line is '<timestamp> <commit hash>'.
  pairs = (line.split() for line in (preceding + within).splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
676
677
def reset_hard(git_repo):
  """Restore modified and deleted files.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
687
688
def list_untracked(git_repo, excludes=None):
  """List untracked files and directories.

  Uses 'git ls-files --others --exclude-standard'; each entry of `excludes`
  is added as an extra '--exclude' pattern anchored at the repo root.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  listing = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [entry.rstrip('/') for entry in listing.splitlines()]
717
718
def distclean(git_repo, excludes=None):
  """Clean up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # A symbolic link is unlinked itself, even if it points to a directory;
    # only real directories are removed recursively.
    if os.path.isdir(path) and not os.path.islink(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
740
741
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`.
  `padding_begin` and `padding_end` are used to control what to return for
  such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, plus the padded records at `after` and/or `before` time if
    requested. Padded records carry 'dummy subject' as subject when
    `with_subject` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  # Trailing part of a padded record; empty unless subjects are requested.
  padding_tail = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + padding_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + padding_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800826
827
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800828def get_history_recursively(git_repo,
829 path,
830 after,
831 before,
832 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800833 padding_end=True,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800834 branch=None):
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800835 """Get commit history of given path and its dependencies.
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800836
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800837 In comparison to get_history(), get_history_recursively also takes
838 dependencies into consideration. For example, if file A referenced file B,
839 get_history_recursively(A) will return commits of B in addition to A. This
840 applies recursively, so commits of C will be included if file B referenced
841 file C, and so on.
842
843 This function is file type neutral. `parser_callback(filename, content)` will
844 be invoked to parse file content and should return list of filename of
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800845 dependencies. If `parser_callback` returns None (usually syntax error), the
846 commit is omitted.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800847
848 Args:
849 git_repo: path of git repo
850 path: path to query, relative to git_repo
851 after: limit history after given time (inclusive)
852 before: limit history before given time (inclusive)
853 parser_callback: callback to parse file content. See above comment.
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800854 padding_end: If True, pads returned result with dummy record at exact
855 'after' time, if 'path' existed at that time.
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800856 branch: branch name or ref name
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800857
858 Returns:
859 list of (commit timestamp, git hash)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800860 """
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800861 history = get_history(
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800862 git_repo,
863 path,
864 after=after,
865 before=before,
866 padding_begin=True,
867 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800868
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800869 # Collect include information of each commit.
870 includes = {}
871 for commit_time, git_rev in history:
872 content = get_file_from_revision(git_repo, git_rev, path)
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800873 parse_result = parser_callback(path, content)
874 if parse_result is None:
875 continue
876 for include_name in parse_result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800877 if include_name not in includes:
878 includes[include_name] = set()
879 includes[include_name].add(git_rev)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800880
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800881 # Analyze the start time and end time of each include.
882 dependencies = []
883 for include in includes:
884 appeared = None
885 for commit_time, git_rev in history:
886 if git_rev in includes[include]:
887 if not appeared:
888 appeared = commit_time
889 else:
890 if appeared:
Zheng-Jie Chang4d617a42020-02-15 06:46:00 +0800891 # dependency file exists in time range [appeared, commit_time)
892 dependencies.append((include, appeared, commit_time - 1))
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800893 appeared = None
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800894
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800895 if appeared is not None:
896 dependencies.append((include, appeared, before))
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800897
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800898 # Recursion and merge.
899 result = list(history)
900 for include, appeared, disappeared in dependencies:
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800901 result += get_history_recursively(
902 git_repo,
903 include,
904 appeared,
905 disappeared,
906 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800907 padding_end=False,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800908 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800909
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800910 # Sort and padding.
911 result.sort(key=lambda x: x[0])
912 if padding_end:
913 pad = (before,)
914 pad += result[-1][1:]
915 result.append(pad)
916
917 # Dedup.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800918 result2 = []
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800919 for x in result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800920 if result2 and result2[-1] == x:
921 continue
922 result2.append(x)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800923
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800924 return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800925
926
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """Lists the branch refs of a git repository.

  Runs `git branch --format=%(refname)` inside `git_repo`, so every returned
  entry is a ref name as printed by git, with surrounding whitespace removed.

  Args:
    git_repo: path of git repo
    all_branches: if True, pass `-a` so remote branches are listed as well
    commit: if not None (or empty), only list branches containing this commit
    remote: if True, pass `--remote` to list only remote tracking branches

  Returns:
    list of branch names, one per line of git output
  """
  args = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    args.append('-a')
  if commit:
    args.extend(['--contains', commit])
  if remote:
    args.append('--remote')

  output = util.check_output(*args, cwd=git_repo)
  return [entry.strip() for entry in output.splitlines()]
951
952
def list_commits_between_commits(git_repo, old, new):
  """Lists the commits in the range (old, new], oldest first.

  Only the first-parent chain is followed. The timestamps in the result are
  passed through _adjust_timestamp_increasingly(), so they may differ from
  the raw commit timestamps reported by git.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev); empty list if `old` equals `new`
  """
  assert old and new
  if old == new:
    return []

  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  raw_commits = [(int(timestamp), git_rev)
                 for timestamp, git_rev in (entry.split()
                                            for entry in output.splitlines())]

  # bisect-kit sorts and bisects commits by timestamp across git repos, which
  # assumes the commit timestamps are increasing. When they are not, the
  # timestamps are adjusted as a workaround. This might lead to bad bisect
  # result, however the bad probability is low in practice since most
  # machines' clocks are good enough.
  num_adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if num_adjusted != 0:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', num_adjusted)

  return commits