blob: 82a202befb04d9151e3c038bd6a6b5c31e85d55f [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080015import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080016
17from bisect_kit import cli
18from bisect_kit import util
19
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
21
# Number of hex digits in a full (unabbreviated) git commit id.
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least 7 digits.

  Args:
    s: string to examine.

  Returns:
    True if `s` consists solely of lowercase hex digits and its length is
    within [GIT_MIN_COMMIT_ID_LENGTH, GIT_FULL_COMMIT_ID_LENGTH].
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Anchor with \Z rather than $: '$' also matches right before a trailing
  # newline, which would wrongly accept strings such as '1234567\n'.
  return bool(re.match(r'[0-9a-f]+\Z', s))
42
43
def argtype_git_rev(s):
  """Argument validator for git hashes.

  Args:
    s: string to validate.

  Returns:
    `s` unchanged, if it looks like a git hash.

  Raises:
    cli.ArgTypeError: `s` is not accepted by is_git_rev().
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
50
51
def is_git_root(path):
  """Tells whether `path` is the root of a git repo.

  Args:
    path: directory to examine.

  Returns:
    True if an entry named '.git' exists directly under `path`.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
55
56
def is_git_bare_dir(path):
  """Tells whether `path` is inside a .git folder or a bare git checkout.

  Args:
    path: directory to examine.

  Returns:
    True if `path` is a directory and 'git rev-parse --is-bare-repository'
    run inside it prints 'true'. False otherwise, including when the git
    command fails.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
66
67
def clone(git_repo, repo_url, reference=None):
  """git clone.

  `git_repo` and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of the directory to clone into.
    repo_url: url of the repo to clone.
    reference: if given, passed to 'git clone' via '--reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
75
76
def checkout_version(git_repo, rev):
  """Quietly force-checkout `rev` ('git checkout -q -f').

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
85
86
def init(git_repo):
  """Runs 'git init' inside `git_repo`.

  The directory (and any missing parent directories) is created first if it
  does not exist yet.

  Args:
    git_repo: path of git repo.
  """
  repo_missing = not os.path.exists(git_repo)
  if repo_missing:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
99
100
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes a file and commits it.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if set
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if set.
        Defaults to `commit_time` when None.
  """
  effective_author_time = commit_time if author_time is None else author_time

  env = {}
  if effective_author_time:
    env['GIT_AUTHOR_DATE'] = str(effective_author_time)
  if commit_time:
    env['GIT_COMMITTER_DATE'] = str(commit_time)

  # Create the file (and its parent directories) in the working tree.
  target = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(target)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(target, 'w') as out:
    out.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
136
137
def config(git_repo, *args):
  """Runs 'git config' with the given arguments.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git config'
  """
  cmd = ('git', 'config') + args
  util.check_call(*cmd, cwd=git_repo)
146
147
def fetch(git_repo, *args):
  """Runs 'git fetch', retrying on server-side (5xx) errors.

  At most 5 attempts are made. Between attempts it sleeps 10 * 2**tries
  seconds, capped at 60. Failures whose stderr does not show a 5xx HTTP error
  are re-raised immediately.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retryable
        error, or kept failing after the last attempt.
  """
  max_tries = 5
  cmd = ('git', 'fetch') + args
  tries = 0
  while True:
    tries += 1
    captured = []
    try:
      util.check_call(*cmd, cwd=git_repo, stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if tries >= max_tries:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      delay = min(60, 10 * 2**tries)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800179
180
Kuang-che Wued1bb622020-05-30 23:06:23 +0800181def _adjust_timestamp_increasingly(commits):
182 """Adjust commit timestamps.
183
184 After adjust, the timestamps are increasing.
185
186 Args:
187 commits: list of (timestamp, commit hash)
188
189 Returns:
190 (adjusted count, list of (timestamp, commit hash))
191 """
192 result = []
193 adjusted = 0
194 last_timestamp = -1
195 for timestamp, git_rev in commits:
196 if timestamp < last_timestamp:
197 adjusted += 1
198 timestamp = last_timestamp
199 else:
200 last_timestamp = timestamp
201 result.append((timestamp, git_rev))
202 return adjusted, result
203
204
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache has no data for a query.

  Callers are expected to catch it and fall back to the original operation.
  """
210
211
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.

  Attributes:
    git_repo: path of git repo.
    branch: branch name or ref name the cache is built from.
    optimized_period: (begin, end) timestamps currently cached, or None if
        optimize() has not been called yet.
    cached: list of (timestamp, commit hash), timestamps non-decreasing.
    commit_to_index: dict mapping commit hash to its index in `cached`.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    self.optimized_period = None
    self.cached = []
    self.commit_to_index = {}

  def optimize(self, period):
    """Loads commits of `period` into the cache.

    Does nothing if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp), both inclusive.
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {}
    for i, (_timestamp, rev) in enumerate(self.cached):
      self.commit_to_index[rev] = i

  def get_rev_by_time(self, timestamp):
    """Looks up the commit at given time.

    Args:
      timestamp: timestamp to query.

    Returns:
      git commit hash, or None if no cached commit is at or before
      `timestamp`.

    Raises:
      FastLookupFailed: nothing is cached yet, or `timestamp` is outside of
          the cached period.
    """
    # optimize() was never called; there is nothing to answer from.
    if not self.optimized_period:
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    # No commit at or before the period at all; avoid indexing an empty list.
    if not self.cached:
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Tells whether `rev` is a cached commit.

    Returns:
      True if `rev` is cached.

    Raises:
      FastLookupFailed: `rev` is not cached, so the answer is unknown.
    """
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed

  def is_ancestor_commit(self, old, new):
    """Tells whether `old` comes before `new` in the cached commit list.

    Returns:
      True if the cached index of `old` is smaller than that of `new`.

    Raises:
      FastLookupFailed: `old` or `new` is not cached.
    """
    old_idx = self.commit_to_index.get(old)
    new_idx = self.commit_to_index.get(new)
    if old_idx is not None and new_idx is not None:
      return old_idx < new_idx
    raise FastLookupFailed
267
268
class FastLookup:
  """Collection of FastLookupEntry, keyed by git repo and then by branch."""

  def __init__(self):
    self.target_period = None
    self.entries = {}

  def optimize(self, period):
    """Sets the time period that later lookups should be cached for."""
    self.target_period = period

  def disable(self):
    """Turns fast lookup off and drops everything cached."""
    self.entries = {}
    self.target_period = None

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up the commit of `branch` at `timestamp` via the cache.

    The entry for (git_repo, branch) is created and filled on first use.

    Raises:
      FastLookupFailed: fast lookup is not enabled, or `timestamp` is outside
          of the target period.
    """
    period = self.target_period
    if not period or not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    per_branch = self.entries.setdefault(git_repo, {})
    if branch not in per_branch:
      per_branch[branch] = FastLookupEntry(git_repo, branch)
    entry = per_branch[branch]
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def _ask_entries(self, git_repo, query):
    """Returns the first answer `query(entry)` gives among entries of a repo.

    Raises:
      FastLookupFailed: nothing is cached for `git_repo`, or no entry could
          answer the query.
    """
    if git_repo not in self.entries:
      raise FastLookupFailed

    for entry in self.entries[git_repo].values():
      try:
        return query(entry)
      except FastLookupFailed:
        continue
    raise FastLookupFailed

  def is_containing_commit(self, git_repo, rev):
    """Tells whether `rev` is a cached commit of `git_repo`."""
    # This function is optimized only after get_rev_by_time() is invoked.
    return self._ask_entries(git_repo,
                             lambda entry: entry.is_containing_commit(rev))

  def is_ancestor_commit(self, git_repo, old, new):
    """Tells whether `old` precedes `new` according to the cached commits."""
    # This function is optimized only after get_rev_by_time() is invoked.
    return self._ask_entries(
        git_repo, lambda entry: entry.is_ancestor_commit(old, new))


# Module-wide cache shared by the lookup functions of this module.
fast_lookup = FastLookup()
323
324
def is_containing_commit(git_repo, rev):
  """Tells whether the given commit exists in the repo.

  The fast lookup cache is consulted first; 'git cat-file -t' is the fallback.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800348
349
def is_ancestor_commit(git_repo, old, new):
  """Tells whether `old` commit is an ancestor of `new` commit.

  The fast lookup cache is consulted first; 'git rev-list --ancestry-path' is
  the fallback.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  try:
    return fast_lookup.is_ancestor_commit(git_repo, old, new)
  except FastLookupFailed:
    pass

  rev_range = '%s..%s' % (old, new)
  cmd = ['git', 'rev-list', '--ancestry-path', '-1', rev_range]
  output = util.check_output(*cmd, cwd=git_repo)
  return output != ''
374
375
def get_commit_metadata(git_repo, rev):
  """Get metadata of given commit.

  The metadata is parsed from the output of 'git cat-file -p'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits (more than one for merge commits); this
          field is unavailable for the very first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  data = util.check_output(
      'git', 'cat-file', '-p', rev, cwd=git_repo, log_stdout=False)
  # Headers and message are separated by the first blank line. partition()
  # yields an empty message instead of raising ValueError if the separator is
  # missing.
  header, _, message = data.partition('\n\n')
  meta = {'message': message}
  for line in header.splitlines():
    m = re.match(r'^tree (\w+)', line)
    if m:
      meta['tree'] = m.group(1)
      continue

    m = re.match(r'^parent (\w+)', line)
    if m:
      # A merge commit has one 'parent' header line per parent. Accumulate
      # them instead of letting the last line overwrite the earlier ones.
      meta.setdefault('parent', []).append(m.group(1))
      continue

    m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
    if m:
      meta[m.group(1)] = m.group(2)
      meta['%s_time' % m.group(1)] = int(m.group(3))
      continue
  return meta
415
416
def get_revlist(git_repo, old, new):
  """Lists git commits between two revisions (inclusive), oldest first.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800433
434
def get_commit_log(git_repo, rev):
  """Returns the raw commit message ('%B') of a commit.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
448
449
def get_commit_hash(git_repo, rev):
  """Resolves a revision to its full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)

  # The output ends with the echoed '--' line; strip it and the newlines.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800475
476
def get_commit_time(git_repo, rev, path=None):
  """Returns the committer timestamp ('%ct') of a revision.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo. If given, the log is limited to this
        path.

  Returns:
    timestamp (int)
  """
  path_filter = ['--', path] if path else []
  cmd = ['git', 'log', '-1', '--format=%ct', rev] + path_filter
  return int(util.check_output(*cmd, cwd=git_repo))
493
494
def is_symbolic_link(git_repo, rev, path):
  """Check if a file is symbolic link.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # Each output line looks like (the name is separated by a TAB):
  #   120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5<TAB>default.xml
  # Split at the TAB instead of on any whitespace, so names containing spaces
  # are compared as a whole.
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  for line in output.split('\n'):
    metadata, tab, name = line.partition('\t')
    fields = metadata.split()
    if tab and fields and name == path:
      # fields[0] is the file mode in octal.
      return stat.S_ISLNK(int(fields[0], 8))

  raise ValueError('file %s is not found in repo:%s rev:%s' %
                   (path, git_repo, rev))
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800518
519
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in `rev`, the link is followed and the content
  of its target is returned.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  # Paths inside a git tree always use '/', regardless of the host OS.
  import posixpath  # pylint: disable=import-outside-toplevel

  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # The link target is relative to the directory holding the link, not to
    # the repo root. Normalize it so './' and '../' components resolve.
    target = posixpath.normpath(
        posixpath.join(posixpath.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800542
543
def list_dir_from_revision(git_repo, rev, path):
  """Lists names under a directory as of given revision.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree_spec = '%s:%s' % (rev, path)
  listing = util.check_output(
      'git', 'ls-tree', '--name-only', tree_spec, cwd=git_repo,
      log_stdout=False)
  return listing.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800565
566
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Finds the commit that was current at given time.

  When `path` is not given, the fast lookup cache is tried first.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass

  cmd = ['git', 'rev-list', '--first-parent', '-1']
  cmd += ['--before', str(timestamp), branch]
  if path:
    cmd += ['--', path]

  git_rev = util.check_output(*cmd, cwd=git_repo).strip()
  if git_rev:
    return git_rev
  return None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800603
604
def get_revlist_by_period(git_repo, branch, period):
  """Lists commits of `branch` in a time period.

  The result starts with the last commit before the period (if there is any),
  followed by the commits inside the period as listed by
  'git rev-list --reverse'.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash)
  """
  begin, end = period[0], period[1]
  base_cmd = ['git', 'rev-list', '--timestamp']

  # Find the last commit before period[0].
  before_cmd = base_cmd + ['-1', '--before', str(begin - 1), branch]
  text = util.check_output(*before_cmd, cwd=git_repo)

  # Find commits in the period.
  within_cmd = base_cmd + [
      '--reverse', '--after',
      str(begin), '--before',
      str(end), branch
  ]
  text += util.check_output(*within_cmd, cwd=git_repo)

  # Each line is '<timestamp> <commit hash>'.
  pairs = (line.split() for line in text.splitlines())
  return [(int(timestamp), commit) for timestamp, commit in pairs]
635
636
def reset_hard(git_repo):
  """Runs "git reset --hard" to restore modified and deleted files.

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
646
647
def list_untracked(git_repo, excludes=None):
  """Lists untracked files and directories reported by 'git ls-files'.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or []:
    assert not os.path.isabs(exclude), 'should be relative'
    # The leading '/' anchors the pattern at the top of git_repo.
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [path.rstrip('/') for path in output.splitlines()]
676
677
def distclean(git_repo, excludes=None):
  """Brings the git repo directory back to a pristine state.

  Modified and deleted files are restored; untracked files are deleted.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # A symbolic link is always unlinked itself, even if it points to a
    # directory; only real directories are removed recursively.
    is_real_dir = not os.path.islink(path) and os.path.isdir(path)
    if is_real_dir:
      shutil.rmtree(path)
    else:
      os.unlink(path)
699
700
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Returns first-parent commit history, optionally limited to a path.

  `after` and `before` could be outside of lifetime of `path`.
  `padding_begin` and `padding_end` control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, plus the padded records at `after` and `before` time if
    requested. Padded records carry 'dummy subject' as their subject when
    `with_subject` is true.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  cmd = ['git', 'log', '--reverse', '--first-parent']
  if with_subject:
    cmd.append('--format=%ct %H %s')
  else:
    cmd.append('--format=%ct %H')
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  # Trailing part of a padded record, shaped like the real records.
  padding_tail = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not (result and result[-1][0] == before):
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + padding_tail)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not (result and result[0][0] == after):
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + padding_tail)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800785
786
def get_history_recursively(git_repo,
                            path,
                            after,
                            before,
                            parser_callback,
                            padding_end=True,
                            branch=None):
  """Get commit history of given path and its dependencies.

  In comparison to get_history(), get_history_recursively also takes
  dependencies into consideration. For example, if file A referenced file B,
  get_history_recursively(A) will return commits of B in addition to A. This
  applies recursively, so commits of C will be included if file B referenced
  file C, and so on.

  This function is file type neutral. `parser_callback(filename, content)` will
  be invoked to parse file content and should return list of filename of
  dependencies. If `parser_callback` returns None (usually syntax error), the
  commit is omitted.

  Args:
    git_repo: path of git repo
    path: path to query, relative to git_repo
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    parser_callback: callback to parse file content. See above comment.
    padding_end: If True, pads returned result with a record at exact
        'before' time, carrying the git hash of the last record. Nothing is
        padded if the history is empty.
    branch: branch name or ref name

  Returns:
    list of (commit timestamp, git hash)
  """
  history = get_history(
      git_repo,
      path,
      after=after,
      before=before,
      padding_begin=True,
      branch=branch)

  # Collect include information of each commit.
  includes = {}
  for _commit_time, git_rev in history:
    content = get_file_from_revision(git_repo, git_rev, path)
    parse_result = parser_callback(path, content)
    if parse_result is None:
      continue
    for include_name in parse_result:
      includes.setdefault(include_name, set()).add(git_rev)

  # Analyze the start time and end time of each include.
  dependencies = []
  for include, including_revs in includes.items():
    # Compare against None explicitly, consistently with the final check
    # below, so a timestamp of 0 is not mistaken for "not appeared".
    appeared = None
    for commit_time, git_rev in history:
      if git_rev in including_revs:
        if appeared is None:
          appeared = commit_time
      elif appeared is not None:
        # dependency file exists in time range [appeared, commit_time)
        dependencies.append((include, appeared, commit_time - 1))
        appeared = None

    if appeared is not None:
      dependencies.append((include, appeared, before))

  # Recursion and merge.
  result = list(history)
  for include, appeared, disappeared in dependencies:
    result += get_history_recursively(
        git_repo,
        include,
        appeared,
        disappeared,
        parser_callback,
        padding_end=False,
        branch=branch)

  # Sort and padding.
  result.sort(key=lambda x: x[0])
  # With an empty history there is no last record to copy the hash from.
  if padding_end and result:
    pad = (before,)
    pad += result[-1][1:]
    result.append(pad)

  # Dedup.
  result2 = []
  for x in result:
    if result2 and result2[-1] == x:
      continue
    result2.append(x)

  return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800884
885
def get_branches(git_repo, all_branches=True, commit=None):
  """Lists branches of a repository, as full ref names.

  Args:
    git_repo: path of git repo
    all_branches: return remote branches if is set to True
    commit: return branches containing this commit if is not None

  Returns:
    list of branch names
  """
  cmd = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    cmd.append('-a')
  if commit:
    cmd.extend(['--contains', commit])

  output = util.check_output(*cmd, cwd=git_repo)
  return [line.strip() for line in output.splitlines()]
907
908
def list_commits_between_commits(git_repo, old, new):
  """Lists first-parent commits in the range (old, new], oldest first.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev). Timestamps are adjusted to be non-decreasing.
  """
  assert old and new
  if old == new:
    return []

  assert is_ancestor_commit(git_repo, old, new)
  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      '%s..%s' % (old, new),
      cwd=git_repo)
  raw_commits = []
  for line in output.splitlines():
    timestamp, git_rev = line.split()
    raw_commits.append((int(timestamp), git_rev))

  # bisect-kit has a fundamental assumption that commit timestamps are
  # increasing because we sort and bisect the commits by timestamp across git
  # repos. If not increasing, we have to adjust the timestamp as workaround.
  # This might lead to bad bisect result, however the bad probability is low in
  # practice since most machines' clocks are good enough.
  adjusted, commits = _adjust_timestamp_increasingly(raw_commits)
  if adjusted:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', adjusted)

  return commits
950 return commits