blob: f3da4e44d85ee70882577d71d95c2fc4be4cf1b9 [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080015import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080016
Kuang-che Wufcbcc502020-06-01 11:48:20 +080017from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080018from bisect_kit import cli
19from bisect_kit import util
20
logger = logging.getLogger(__name__)

GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to examine.

  Returns:
    True if `s` looks like a (possibly shortened) git commit id.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor with \Z instead of $: '$' also matches right before a trailing
  # newline, which would let strings like 'abcdef1\n' pass as a valid hash.
  return bool(re.match(r'^[0-9a-f]+\Z', s))
43
44
def argtype_git_rev(s):
  """Argument type checker for git commit ids.

  Args:
    s: command line argument value.

  Returns:
    `s` unchanged if it looks like a git hash.

  Raises:
    cli.ArgTypeError: if `s` is not a git hash-like string.
  """
  if is_git_rev(s):
    return s
  reason = 'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH
  raise cli.ArgTypeError(reason, '1a2b3c4d5e')
51
52
def is_git_root(path):
  """Tells whether `path` is the root of a git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
56
57
def is_git_bare_dir(path):
  """Tells whether `path` is inside a .git folder or a bare git checkout.

  Args:
    path: directory path to examine.

  Returns:
    True if 'git rev-parse --is-bare-repository' run in `path` prints 'true'.
    False if `path` is not a directory or the git command fails.
  """
  if not os.path.isdir(path):
    return False
  try:
    answer = util.check_output(
        'git', 'rev-parse', '--is-bare-repository', cwd=path)
  except subprocess.CalledProcessError:
    return False
  return answer == 'true\n'
67
68
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the directory to clone into.
    repo_url: url of the repository to clone.
    reference: if specified, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
76
77
def checkout_version(git_repo, rev):
  """Quietly and forcibly checks out a revision ('git checkout -q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
86
87
def init(git_repo):
  """Creates a git repository via 'git init -q'.

  The directory `git_repo`, including missing parent directories, is created
  first when it does not exist yet.

  Args:
    git_repo: path of git repo.
  """
  repo_missing = not os.path.exists(git_repo)
  if repo_missing:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
100
101
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes `content` to a file and commits it.

  Missing parent directories of the file are created.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp
    author_time: author timestamp; defaults to `commit_time` if None
  """
  effective_author_time = commit_time if author_time is None else author_time

  # Only set the date variables which have a (truthy) value.
  env = {}
  for name, value in (('GIT_AUTHOR_DATE', effective_author_time),
                      ('GIT_COMMITTER_DATE', commit_time)):
    if value:
      env[name] = str(value)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
137
138
def config(git_repo, *args):
  """Runs 'git config' inside the given repo.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ('git', 'config') + args
  util.check_call(*cmd, cwd=git_repo)
147
148
def fetch(git_repo, *args):
  """Runs 'git fetch', retrying on HTTP 5xx server errors.

  At most 5 attempts are made. Failures whose stderr does not indicate a 5xx
  response are re-raised immediately.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: if fetch failed and is not retried anymore.
  """
  attempt = 0
  while True:
    attempt += 1
    captured = []
    try:
      util.check_call(
          'git', 'fetch', *args, cwd=git_repo, stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if attempt >= 5:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      # Exponential backoff, capped at one minute.
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800180
181
Kuang-che Wued1bb622020-05-30 23:06:23 +0800182def _adjust_timestamp_increasingly(commits):
183 """Adjust commit timestamps.
184
185 After adjust, the timestamps are increasing.
186
187 Args:
188 commits: list of (timestamp, commit hash)
189
190 Returns:
191 (adjusted count, list of (timestamp, commit hash))
192 """
193 result = []
194 adjusted = 0
195 last_timestamp = -1
196 for timestamp, git_rev in commits:
197 if timestamp < last_timestamp:
198 adjusted += 1
199 timestamp = last_timestamp
200 else:
201 last_timestamp = timestamp
202 result.append((timestamp, git_rev))
203 return adjusted, result
204
205
class FastLookupFailed(Exception):
  """The fast lookup cache cannot answer this query.

  Callers are expected to catch this and fall back to the original (slower)
  operation.
  """
211
212
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch name.
    optimized_period: (begin, end) timestamps of the cached period, or None if
        optimize() has not been called yet.
    cached: list of (timestamp, commit hash), with non-decreasing timestamps.
    commit_to_index: dict mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Loads commits of `period` into the cache.

    Nothing is done if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp), both inclusive.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Queries the cached commit of given time.

    Args:
      timestamp: timestamp in query.

    Returns:
      git commit hash. None if no cached commit is at or before `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet or `timestamp` is outside of the
          cached period.
    """
    # optimize() was never called; there is nothing to answer with.
    if not self.optimized_period:
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    # No commit exists at or before the end of the period. Without this guard,
    # indexing self.cached[0] below would raise IndexError.
    if not self.cached:
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # '' sorts before any commit hash, so idx is the first entry whose
    # timestamp is >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is cached.

    Returns:
      True if `rev` is in the cache.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed

  def is_ancestor_commit(self, old, new):
    """Tells whether `old` appears before `new` in the cached commit list.

    Returns:
      True if both commits are cached and `old` is placed before `new`.

    Raises:
      FastLookupFailed: at least one of the commits is not cached.
    """
    old_idx = self.commit_to_index.get(old)
    new_idx = self.commit_to_index.get(new)
    if old_idx is not None and new_idx is not None:
      return old_idx < new_idx
    raise FastLookupFailed
268
269
class FastLookup:
  """Collection of FastLookupEntry, keyed by git repo and then branch."""

  def __init__(self):
    self.entries = {}
    self.target_period = None

  def optimize(self, period):
    """Sets the period to cache; entries are populated lazily on query."""
    self.target_period = period

  def disable(self):
    """Turns off fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Queries commit of given time from cache.

    Raises:
      FastLookupFailed: fast lookup is not enabled or `timestamp` is outside
          of the target period.
    """
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    per_branch = self.entries.setdefault(git_repo, {})
    entry = per_branch.get(branch)
    if entry is None:
      entry = per_branch[branch] = FastLookupEntry(git_repo, branch)
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Tells whether `rev` is known to the cached entries of `git_repo`.

    Raises:
      FastLookupFailed: no cached entry can answer the query.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    for entry in self.entries.get(git_repo, {}).values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
    raise FastLookupFailed

  def is_ancestor_commit(self, git_repo, old, new):
    """Answers ancestor query using cached entries of `git_repo`.

    Raises:
      FastLookupFailed: no cached entry can answer the query.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    for entry in self.entries.get(git_repo, {}).values():
      try:
        return entry.is_ancestor_commit(old, new)
      except FastLookupFailed:
        continue
    raise FastLookupFailed


fast_lookup = FastLookup()
324
325
def is_containing_commit(git_repo, rev):
  """Determines given commit exists.

  The fast lookup cache is consulted first; 'git cat-file -t' is the fallback.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800349
350
def is_ancestor_commit(git_repo, old, new):
  """Determines `old` commit is ancestor of `new` commit.

  The fast lookup cache is consulted first; 'git rev-list --ancestry-path' is
  the fallback.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  try:
    return fast_lookup.is_ancestor_commit(git_repo, old, new)
  except FastLookupFailed:
    pass

  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--ancestry-path', '-1', rev_range, cwd=git_repo)
  return output != ''
375
376
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits (more than one for merge commits); this
          field is unavailable for the very first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  meta = {}
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_stdout=False)
  # Headers and commit message are separated by the first blank line.
  header, meta['message'] = data.split('\n\n', 1)
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # A commit object has one 'parent' line per parent, so accumulate
      # instead of overwriting; otherwise only the last parent of a merge
      # commit is kept.
      meta.setdefault('parent', []).append(m.group(1))
      continue

    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
417
418
def get_revlist(git_repo, old, new):
  """Lists git commits from `old` to `new`, both ends included.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  # 'old^..new' starts from the parent of `old`, so `old` itself is included.
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800435
436
def get_commit_log(git_repo, rev):
  """Returns the raw commit message ('%B') of a commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
450
451
def get_commit_hash(git_repo, rev):
  """Resolves a revision name to the full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  commit_spec = '%s^{commit}' % rev
  try:
    output = util.check_output(
        'git', 'rev-parse', commit_spec, '--', cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # The output also echoes the '--' separator; strip it with the newlines.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800477
478
def get_commit_time(git_repo, rev, path=None):
  """Returns the committer timestamp of a revision.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo; if specified, 'git log' is limited to
        this path

  Returns:
    timestamp (int)
  """
  path_filter = ['--', path] if path else []
  cmd = ['git', 'log', '-1', '--format=%ct', rev] + path_filter
  return int(util.check_output(*cmd, cwd=git_repo))
495
496
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5\tdefault.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  for line in output.splitlines():
    # Metadata and file name are separated by a TAB. Splitting on the TAB
    # (instead of any whitespace) keeps file names containing spaces intact.
    meta, sep, name = line.partition('\t')
    if sep and name == path:
      mode = meta.split()[0]
      return stat.S_ISLNK(int(mode, 8))

  raise ValueError('file %s is not found in repo:%s rev:%s' %
                   (path, git_repo, rev))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800520
521
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in the repo, the link is followed and the
  content of its target is returned.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The link target is relative to the directory containing the link, not to
    # the repo root, so resolve it against dirname(path) first. For links at
    # the repo root this yields the same path as the raw target.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800545
546
def list_dir_from_revision(git_repo, rev, path):
  """Lists names inside a directory as of the given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800568
569
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Query commit of given time.

  When `path` is not specified, the fast lookup cache is tried first.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = [
      'git', 'rev-list', '--first-parent', '-1', '--before',
      str(timestamp), branch
  ]
  if path:
    cmd.extend(['--', path])

  rev = util.check_output(*cmd, cwd=git_repo).strip()
  return rev if rev else None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800606
607
def get_revlist_by_period(git_repo, branch, period):
  """Lists commits of a branch around the given time period.

  The result starts with the last commit before the period (if any), followed
  by the commits within the period in 'git rev-list --reverse' order.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash)
  """
  begin, end = str(period[0]), str(period[1])
  base_cmd = ['git', 'rev-list', '--timestamp']

  # Find the last commit before period[0].
  preceding_cmd = base_cmd + ['-1', '--before', str(period[0] - 1), branch]
  preceding = util.check_output(*preceding_cmd, cwd=git_repo)

  # Find commits in the period.
  within_cmd = base_cmd + [
      '--reverse', '--after', begin, '--before', end, branch
  ]
  within = util.check_output(*within_cmd, cwd=git_repo)

  result = []
  for line in (preceding + within).splitlines():
    timestamp, commit = line.split()
    result.append((int(timestamp), commit))
  return result
638
639
def reset_hard(git_repo):
  """Runs "git reset --hard" to restore modified and deleted files.

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
649
650
def list_untracked(git_repo, excludes=None):
  """Lists untracked files and directories via 'git ls-files --others'.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    # Leading '/' anchors the pattern at the root of git_repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [entry.rstrip('/') for entry in output.splitlines()]
679
680
def distclean(git_repo, excludes=None):
  """Cleans up a git checkout.

  Modified and deleted files are restored by reset_hard(); untracked files and
  directories are deleted.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # Only real directories are removed recursively; a symbolic link (even one
    # pointing to a directory) is unlinked like a regular file.
    if os.path.isdir(path) and not os.path.islink(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
702
703
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, plus the padded begin/end records if requested. For padded
    records, 'dummy subject' is used as the subject if `with_subject` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  log_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent', '--format=' + log_format]
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  if not (padding_begin or padding_end):
    return result

  # Trailing fields of padded records.
  padding_tail = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + padding_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + padding_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800788
789
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  commit is omitted.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with a record at exact
        'before' time, carrying the git hash of the last record. No record is
        padded if the history is empty.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit.
  includes = {}
  for _commit_time, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, including_revs in includes.items():
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        if not appeared:
          appeared = commit_time
      else:
        if appeared:
          # dependency file exists in time range [appeared, commit_time)
          dependencies.append((include, appeared, commit_time - 1))
          appeared = None

    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # Nothing to pad if `path` has no history at all in the given period;
  # result[-1] would raise IndexError otherwise.
  if padding_end and result:
    pad = (before,)
    pad += result[-1][1:]
    result.append(pad)

  # Dedup.
  result2 = []
  for x in result:
    if result2 and result2[-1] == x:
      continue
    result2.append(x)

  return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800887
888
def get_branches(git_repo, all_branches=True, commit=None):
  """Lists branches (as full ref names) of a repository.

  Args:
    git_repo: path of git repo
    all_branches: return remote branches if is set to True
    commit: return branches containing this commit if is not None

  Returns:
    list of branch names
  """
  cmd = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    cmd.append('-a')
  if commit:
    cmd.extend(['--contains', commit])

  output = util.check_output(*cmd, cwd=git_repo)
  return [line.strip() for line in output.splitlines()]
910
911
def list_commits_between_commits(git_repo, old, new):
  """Get all commits between (old, new].

  `old` must be an ancestor of `new`, unless they are identical.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev)
  """
  assert old and new
  if old == new:
    return []

  assert is_ancestor_commit(git_repo, old, new)
  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)
  fields_per_line = (line.split() for line in output.splitlines())
  commits = [(int(timestamp), git_rev) for timestamp, git_rev in fields_per_line]

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(commits)
  if adjusted:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', adjusted)

  return commits