blob: fb60a2611cceaba66734fd117c52e5ed8949ee09 [file] [log] [blame]
Kuang-che Wu6e4beca2018-06-27 17:45:02 +08001# -*- coding: utf-8 -*-
Kuang-che Wue41e0062017-09-01 19:04:14 +08002# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Git utility."""
6
7from __future__ import print_function
Kuang-che Wued1bb622020-05-30 23:06:23 +08008import bisect
Kuang-che Wue41e0062017-09-01 19:04:14 +08009import logging
Kuang-che Wubfc4a642018-04-19 11:54:08 +080010import os
Kuang-che Wue41e0062017-09-01 19:04:14 +080011import re
Kuang-che Wu3d04eda2019-09-05 23:56:40 +080012import shutil
Zheng-Jie Chang29144442020-02-18 11:53:25 +080013import stat
Kuang-che Wue41e0062017-09-01 19:04:14 +080014import subprocess
Kuang-che Wu13acc7b2020-06-15 10:45:35 +080015import tempfile
Kuang-che Wu2b1286b2019-05-20 20:37:26 +080016import time
Kuang-che Wue41e0062017-09-01 19:04:14 +080017
Kuang-che Wufcbcc502020-06-01 11:48:20 +080018from bisect_kit import cache_util
Kuang-che Wue41e0062017-09-01 19:04:14 +080019from bisect_kit import cli
20from bisect_kit import util
21
22logger = logging.getLogger(__name__)
23
GIT_FULL_COMMIT_ID_LENGTH = 40

# Minimal acceptable length of git commit id.
#
# For chromium, hash collision rate over number of digits:
# - 6 digits: 4.85%
# - 7 digits: 0.32%
# - 8 digits: 0.01%
# As foolproof check, 7 digits should be enough.
GIT_MIN_COMMIT_ID_LENGTH = 7


def is_git_rev(s):
  """Is a git hash-like version string.

  It accepts shortened hash with at least GIT_MIN_COMMIT_ID_LENGTH digits and
  at most GIT_FULL_COMMIT_ID_LENGTH digits. Only lowercase hex digits are
  accepted.

  Args:
    s: string to check.

  Returns:
    True if the whole string looks like a (possibly shortened) git hash.
  """
  if not GIT_MIN_COMMIT_ID_LENGTH <= len(s) <= GIT_FULL_COMMIT_ID_LENGTH:
    return False
  # Use fullmatch instead of '^...$': '$' also matches right before a trailing
  # newline, which would wrongly accept strings like '1a2b3c4\n'.
  return bool(re.fullmatch(r'[0-9a-f]+', s))
44
45
def argtype_git_rev(s):
  """Argument type checker for git hashes.

  Args:
    s: string given by the user.

  Returns:
    `s` unchanged, if it passes is_git_rev().

  Raises:
    cli.ArgTypeError: `s` doesn't look like a git hash. The second argument
        passed to the exception ('1a2b3c4d5e') is presumably an example of
        valid input shown to the user -- see cli.ArgTypeError.
  """
  if is_git_rev(s):
    return s
  raise cli.ArgTypeError(
      'should be git hash, at least %d digits' % GIT_MIN_COMMIT_ID_LENGTH,
      '1a2b3c4d5e')
52
53
def is_git_root(path):
  """Tells whether `path` is the root of a git checkout.

  The check is purely file based: `path` is considered a git root if it
  contains an entry named '.git' (file or directory).

  Args:
    path: directory to check.

  Returns:
    True if `path`/.git exists.
  """
  dot_git = os.path.join(path, '.git')
  return os.path.exists(dot_git)
57
58
def is_git_bare_dir(path):
  """Tells whether `path` is inside .git folder or a bare git checkout.

  Args:
    path: directory to check.

  Returns:
    True if 'git rev-parse --is-bare-repository', run inside `path`, prints
    'true'. False if `path` is not a directory or the git command fails.
  """
  if os.path.isdir(path):
    try:
      answer = util.check_output(
          'git', 'rev-parse', '--is-bare-repository', cwd=path)
    except subprocess.CalledProcessError:
      return False
    return answer == 'true\n'
  return False
68
69
def clone(git_repo, repo_url, reference=None):
  """git clone.

  git_repo and its parent directories will be created if they don't exist.
  The clone is performed inside git_repo (destination '.').

  Args:
    git_repo: path of the local git repo to create.
    repo_url: url to clone from.
    reference: if specified, passed to 'git clone --reference'.
  """
  if not os.path.exists(git_repo):
    os.makedirs(git_repo)
  reference_flags = ['--reference', reference] if reference else []
  util.check_call(
      'git', 'clone', repo_url, '.', *reference_flags, cwd=git_repo)
77
78
def checkout_version(git_repo, rev):
  """Checks out the given revision quietly and forcibly.

  This runs 'git checkout -q -f <rev>'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision to checkout.
  """
  cmd = ['git', 'checkout', '-q', '-f', rev]
  util.check_call(*cmd, cwd=git_repo)
87
88
def init(git_repo):
  """Creates an empty git repo by 'git init -q'.

  git_repo and its parent directories will be created if they don't exist.

  Args:
    git_repo: path of git repo.
  """
  already_there = os.path.exists(git_repo)
  if not already_there:
    os.makedirs(git_repo)

  cmd = ['git', 'init', '-q']
  util.check_call(*cmd, cwd=git_repo)
101
102
def commit_file(git_repo,
                path,
                message,
                content,
                commit_time=None,
                author_time=None):
  """Writes a file and commits it.

  The file (and its parent directories, if necessary) is created under
  git_repo, overwritten with `content`, added and then committed.

  Args:
    git_repo: path of git repo
    path: file path, relative to git_repo
    message: commit message
    content: file content
    commit_time: commit timestamp; exported as GIT_COMMITTER_DATE if truthy
    author_time: author timestamp; exported as GIT_AUTHOR_DATE if truthy.
        Defaults to commit_time if None.
  """
  if author_time is None:
    author_time = commit_time

  # Only truthy timestamps are exported to git.
  env = {}
  for name, value in (('GIT_AUTHOR_DATE', author_time),
                      ('GIT_COMMITTER_DATE', commit_time)):
    if value:
      env[name] = str(value)

  full_path = os.path.join(git_repo, path)
  parent_dir = os.path.dirname(full_path)
  if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
  with open(full_path, 'w') as f:
    f.write(content)

  util.check_call('git', 'add', path, cwd=git_repo)
  commit_cmd = ['git', 'commit', '-q', '-m', message, path]
  util.check_call(*commit_cmd, cwd=git_repo, env=env)
138
139
def config(git_repo, *args):
  """Runs 'git config' inside the given repo.

  Args:
    git_repo: path of git repo.
    args: parameters appended to the 'git config' command line.
  """
  cmd = ['git', 'config']
  cmd.extend(args)
  util.check_call(*cmd, cwd=git_repo)
148
149
def fetch(git_repo, *args):
  """Wrapper of 'git fetch' with retry support.

  Only failures whose stderr mentions an HTTP 5xx error ("The requested URL
  returned error: 5") are retried, with exponentially growing delay capped at
  60 seconds. At most 5 attempts are made in total.

  Args:
    git_repo: path of git repo.
    args: parameters pass to 'git fetch'

  Raises:
    subprocess.CalledProcessError: the fetch failed with a non-retriable
        error, or it kept failing after all attempts.
  """
  attempt = 0
  while True:
    attempt += 1
    captured = []
    try:
      util.check_call(
          'git',
          'fetch',
          *args,
          cwd=git_repo,
          stderr_callback=captured.append)
    except subprocess.CalledProcessError:
      if attempt >= 5:
        logger.error('git fetch failed too much times')
        raise
      # only retry 5xx internal server error
      if 'The requested URL returned error: 5' not in ''.join(captured):
        raise
      delay = min(60, 10 * 2**attempt)
      logger.warning('git fetch failed, will retry %s seconds later', delay)
      time.sleep(delay)
    else:
      return
Kuang-che Wu1e49f512018-12-06 15:27:42 +0800181
182
Kuang-che Wued1bb622020-05-30 23:06:23 +0800183def _adjust_timestamp_increasingly(commits):
184 """Adjust commit timestamps.
185
186 After adjust, the timestamps are increasing.
187
188 Args:
189 commits: list of (timestamp, commit hash)
190
191 Returns:
192 (adjusted count, list of (timestamp, commit hash))
193 """
194 result = []
195 adjusted = 0
196 last_timestamp = -1
197 for timestamp, git_rev in commits:
198 if timestamp < last_timestamp:
199 adjusted += 1
200 timestamp = last_timestamp
201 else:
202 last_timestamp = timestamp
203 result.append((timestamp, git_rev))
204 return adjusted, result
205
206
class FastLookupFailed(Exception):
  """Raised when the fast lookup cache cannot answer a query.

  It means no data is cached for the query; callers are expected to catch this
  and fall back to the original (slow) git operation.
  """
212
213
class FastLookupEntry:
  """Cached commits from one branch of given time period.

  With this class, we can look up commit via commit hash and timestamp fast.
  """

  def __init__(self, git_repo, branch):
    self.git_repo = git_repo
    self.branch = branch
    # (begin, end) timestamps covered by the cache; None before optimize().
    self.optimized_period = None
    # List of (timestamp, commit hash), timestamps adjusted to be
    # non-decreasing so the list can be binary searched.
    self.cached = []
    # Map commit hash to its index in self.cached.
    self.commit_to_index = {}

  def optimize(self, period):
    """Caches commits of the given period.

    Nothing is done if `period` is already covered by the cached period.

    Args:
      period: (begin timestamp, end timestamp)
    """
    assert period[0] <= period[1]
    if (self.optimized_period and self.optimized_period[0] <= period[0] and
        period[1] <= self.optimized_period[1]):
      # already done
      return

    self.cached = get_revlist_by_period(self.git_repo, self.branch, period)
    self.optimized_period = period

    # Adjust timestamps, so we can do binary search by timestamp
    _adjusted, self.cached = _adjust_timestamp_increasingly(self.cached)

    self.commit_to_index = {
        rev: i for i, (_timestamp, rev) in enumerate(self.cached)
    }

  def get_rev_by_time(self, timestamp):
    """Looks up the cached commit at the given time.

    Args:
      timestamp: timestamp in query

    Returns:
      git commit hash; None if no cached commit is at or before `timestamp`.

    Raises:
      FastLookupFailed: `timestamp` is outside of the cached period (or
          nothing has been cached yet).
    """
    if self.optimized_period is None:
      # optimize() was never called; nothing can be answered from cache.
      raise FastLookupFailed
    if not self.optimized_period[0] <= timestamp <= self.optimized_period[1]:
      raise FastLookupFailed

    if not self.cached:
      # The branch has no commit before the end of the period. Without this
      # guard, self.cached[0] below raises IndexError.
      return None

    # Note that, the return value might be different as "git rev-list" if the
    # actual commit timestamps are not fully increasing.
    # '' sorts before any commit hash, so idx is the first entry whose
    # timestamp >= `timestamp`.
    x = (timestamp, '')
    idx = bisect.bisect_right(self.cached, x)
    if idx == 0 and timestamp < self.cached[0][0]:
      return None
    if idx == len(self.cached) or self.cached[idx][0] != timestamp:
      # No exact match; use the closest commit before `timestamp`.
      idx -= 1
    return self.cached[idx][1]

  def is_containing_commit(self, rev):
    """Returns True if `rev` is cached; raises FastLookupFailed otherwise."""
    if rev in self.commit_to_index:
      return True
    raise FastLookupFailed

  def is_ancestor_commit(self, old, new):
    """Compares cached positions of two commits.

    Returns:
      True if `old` is cached before `new`.

    Raises:
      FastLookupFailed: at least one of the commits is not cached.
    """
    old_idx = self.commit_to_index.get(old)
    new_idx = self.commit_to_index.get(new)
    if old_idx is not None and new_idx is not None:
      return old_idx < new_idx
    raise FastLookupFailed
269
270
class FastLookup:
  """Collection of FastLookupEntry, indexed by git repo and branch."""

  def __init__(self):
    # self.entries[git_repo][branch] is a FastLookupEntry.
    self.entries = {}
    # (begin, end) timestamps to cache; None means fast lookup is disabled.
    self.target_period = None

  def optimize(self, period):
    """Sets the time period to cache. Entries are populated lazily."""
    self.target_period = period

  def disable(self):
    """Turns off fast lookup and drops all cached entries."""
    self.target_period = None
    self.entries = {}

  def get_rev_by_time(self, git_repo, timestamp, branch):
    """Looks up commit by time via cache.

    The cache entry of (git_repo, branch) is created and populated on demand.

    Raises:
      FastLookupFailed: fast lookup is disabled or `timestamp` is outside of
          the target period.
    """
    period = self.target_period
    if not period:
      raise FastLookupFailed
    if not period[0] <= timestamp <= period[1]:
      raise FastLookupFailed

    branches = self.entries.setdefault(git_repo, {})
    if branch not in branches:
      branches[branch] = FastLookupEntry(git_repo, branch)
    entry = branches[branch]
    entry.optimize(period)
    return entry.get_rev_by_time(timestamp)

  def is_containing_commit(self, git_repo, rev):
    """Returns True if `rev` is cached in any branch entry of `git_repo`.

    Raises:
      FastLookupFailed: no entry can answer the query.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    branches = self.entries.get(git_repo)
    if branches is None:
      raise FastLookupFailed

    for entry in branches.values():
      try:
        return entry.is_containing_commit(rev)
      except FastLookupFailed:
        continue
    raise FastLookupFailed

  def is_ancestor_commit(self, git_repo, old, new):
    """Answers ancestry query by the first entry which caches both commits.

    Raises:
      FastLookupFailed: no entry can answer the query.
    """
    # This function is optimized only after get_rev_by_time() is invoked.
    branches = self.entries.get(git_repo)
    if branches is None:
      raise FastLookupFailed

    for entry in branches.values():
      try:
        return entry.is_ancestor_commit(old, new)
      except FastLookupFailed:
        continue
    raise FastLookupFailed


fast_lookup = FastLookup()
325
326
@cache_util.Cache.default_disabled
def is_containing_commit(git_repo, rev):
  """Tells whether the given commit exists in the repo.

  The fast lookup cache is consulted first; 'git cat-file -t' is used if the
  cache cannot answer.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    True if rev is inside given git repo. If git_repo is not a git folder,
    returns False as well.
  """
  try:
    return fast_lookup.is_containing_commit(git_repo, rev)
  except FastLookupFailed:
    pass  # Not cached; ask git.

  try:
    object_type = util.check_output(
        'git', 'cat-file', '-t', rev, cwd=git_repo)
  except (subprocess.CalledProcessError, OSError):
    # Either git failed or the command could not be run at all.
    return False
  return object_type == 'commit\n'
Kuang-che Wue41e0062017-09-01 19:04:14 +0800351
352
def is_ancestor_commit(git_repo, old, new):
  """Tells whether `old` commit is an ancestor of `new` commit.

  The fast lookup cache is consulted first; 'git rev-list --ancestry-path' is
  used if the cache cannot answer.

  Args:
    git_repo: path of git repo.
    old: the ancestor commit.
    new: the descendant commit.

  Returns:
    True only if `old` is the ancestor of `new`. One commit is not considered
    as ancestor of itself.
  """
  try:
    return fast_lookup.is_ancestor_commit(git_repo, old, new)
  except FastLookupFailed:
    pass  # Not cached; ask git.

  cmd = ['git', 'rev-list', '--ancestry-path', '-1', '%s..%s' % (old, new)]
  output = util.check_output(*cmd, cwd=git_repo)
  # Any output means at least one commit is on the path from old to new.
  return output != ''
377
378
Kuang-che Wu13acc7b2020-06-15 10:45:35 +0800379def _parse_commit_object(s):
380 meta = {}
381 header, meta['message'] = s.split('\n\n', 1)
382 for line in header.splitlines():
383 m = re.match(r'^tree (\w+)', line)
384 if m:
385 meta['tree'] = m.group(1)
386 continue
387
388 m = re.match(r'^parent (\w+)', line)
389 if m:
390 meta['parent'] = line.split()[1:]
391 continue
392
393 m = re.match(r'^(author|committer) (.*) (\d+) (\S+)$', line)
394 if m:
395 meta[m.group(1)] = m.group(2)
396 meta['%s_time' % m.group(1)] = int(m.group(3))
397 continue
398 return meta
399
400
@cache_util.Cache.default_disabled
def get_commit_metadata(git_repo, rev):
  """Reads and parses the commit object of given revision.

  Args:
    git_repo: path of git repo.
    rev: git commit revision in query.

  Returns:
    dict of metadata, including (if available):
      tree: hash of git tree object
      parent: list of parent commits; this field is unavailable for the very
          first commit of git repo.
      author: name and email of author
      author_time: author timestamp (without timezone information)
      committer: name and email of committer
      committer_time: commit timestamp (without timezone information)
      message: commit message text
  """
  cmd = ['git', 'cat-file', '-p', rev]
  return _parse_commit_object(
      util.check_output(*cmd, cwd=git_repo, log_stdout=False))
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800423
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800424
def get_batch_commit_metadata(git_repo, revs):
  """Get metadata of multiple commits by one 'git cat-file --batch' call.

  Args:
    git_repo: path of git repo.
    revs: iterable of git revisions (str) to query.

  Returns:
    dict, keyed by the object name printed by git for each query. The value is
    the metadata dict (see _parse_commit_object), or None if git didn't report
    object size for the query (e.g. "<name> missing").
  """
  query = '\n'.join(revs)
  logger.debug('get_batch_commit_metadata %r', query)
  with tempfile.NamedTemporaryFile('w+t') as f:
    f.write(query)
    f.flush()
    # util.check_output doesn't support stdin, so use shell
    # redirect instead.
    # The file name is passed as positional parameter ($1) instead of being
    # concatenated into the command string, so a temp directory containing
    # spaces or shell metacharacters cannot break the command.
    # binary=True because we need to count size in bytes later.
    data = util.check_output(
        'sh',
        '-c',
        'git cat-file --batch < "$1"',
        'sh',
        f.name,
        cwd=git_repo,
        binary=True)

  metas = {}
  while data:
    # Each record starts with "<name> <type> <size>" (or "<name> missing").
    first_line, data = data.split(b'\n', 1)
    m = re.match(r'^(\w+) (\w+)(?: (\d+))?', first_line.decode('utf8'))
    assert m, repr(first_line)
    object_name, object_type = m.group(1, 2)
    if not m.group(3):
      metas[object_name] = None
      continue
    assert object_type == 'commit', 'unsupported object type: %s' % object_type
    object_size = int(m.group(3))
    # The object content is followed by exactly one newline.
    assert data[object_size] == ord(b'\n'), repr(data[object_size])
    obj, data = data[:object_size], data[object_size + 1:]
    metas[object_name] = _parse_commit_object(obj.decode('utf8'))
  return metas
Kuang-che Wu8a28a9d2018-09-11 17:43:36 +0800456
457
def get_revlist(git_repo, old, new):
  """Lists git commits between two revisions (inclusive), oldest first.

  This runs 'git rev-list --reverse <old>^..<new>'.

  Args:
    git_repo: path of git repo.
    old: git commit revision.
    new: git commit revision.

  Returns:
    list of git revisions. The list contains the input revisions, old and new.
  """
  assert old
  assert new
  rev_range = '%s^..%s' % (old, new)
  output = util.check_output(
      'git', 'rev-list', '--reverse', rev_range, cwd=git_repo)
  return output.splitlines()
Kuang-che Wue2563ea2018-01-05 20:30:28 +0800474
475
def get_commit_log(git_repo, rev):
  """Gets the commit message of given revision.

  This runs 'git log -1 --format=%B <rev>'.

  Args:
    git_repo: path of git repo.
    rev: git commit revision.

  Returns:
    commit log message
  """
  return util.check_output(
      'git', 'log', '-1', '--format=%B', rev, cwd=git_repo)
489
490
def get_commit_hash(git_repo, rev):
  """Resolves a revision to full git commit hash.

  Args:
    git_repo: path of git repo.
    rev: could be git tag, branch, or (shortened) commit hash

  Returns:
    full git commit hash

  Raises:
    ValueError: `rev` is not unique or doesn't exist
  """
  # Use '^{commit}' to restrict search only commits.
  # Use '--' to avoid ambiguity, like matching rev against path name.
  cmd = ['git', 'rev-parse', '%s^{commit}' % rev, '--']
  try:
    output = util.check_output(*cmd, cwd=git_repo)
  except subprocess.CalledProcessError:
    # Do not use 'git rev-parse --disambiguate' to determine uniqueness
    # because it searches objects other than commits as well.
    raise ValueError('%s is not unique or does not exist' % rev)
  # Strip the trailing newlines and the echoed '--'.
  git_rev = output.rstrip('-\n')
  assert is_git_rev(git_rev)
  return git_rev
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800516
517
def get_commit_time(git_repo, rev, path=None):
  """Gets commit timestamp of given revision.

  This runs 'git log -1 --format=%ct <rev>', optionally limited to `path`.

  Args:
    git_repo: path of git repo
    rev: git commit id, branch name, tag name, or other git object
    path: path, relative to git_repo

  Returns:
    timestamp (int)
  """
  path_flags = ['--', path] if path else []
  output = util.check_output(
      'git', 'log', '-1', '--format=%ct', rev, *path_flags, cwd=git_repo)
  return int(output)
534
535
def is_symbolic_link(git_repo, rev, path):
  """Tells whether a file in given revision is a symbolic link.

  The file mode is read from output of 'git ls-tree'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    True if the specified file is a symbolic link in repo.

  Raises:
    ValueError if not found
  """
  # format: 120000 blob 8735a8c1dd96ede39a21d983d5c96792fd15c1a5 default.xml
  # TODO(kcwu): handle escaped path with special characters
  output = util.check_output(
      'git', 'ls-tree', rev, '--full-name', path, cwd=git_repo)
  fields = output.split()
  if len(fields) < 4 or fields[3] != path:
    raise ValueError('file %s is not found in repo:%s rev:%s' %
                     (path, git_repo, rev))

  # The first field is the file mode in octal.
  mode = int(fields[0], 8)
  return stat.S_ISLNK(mode)
Zheng-Jie Chang1ace3012020-02-15 04:51:05 +0800559
560
@cache_util.Cache.default_disabled
def get_file_from_revision(git_repo, rev, path):
  """Get file content of given revision.

  If `path` is a symbolic link in the revision, the link is followed and the
  content of the link target is returned instead.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: file path

  Returns:
    file content (str)
  """
  result = util.check_output(
      'git', 'show', '%s:%s' % (rev, path), cwd=git_repo, log_stdout=False)

  # It might be a symbolic link.
  # In extreme case, it's possible that filenames contain special characters,
  # like newlines. In practice, it should be safe to assume no such cases and
  # reduce disk i/o.
  if '\n' not in result and is_symbolic_link(git_repo, rev, path):
    # For a symbolic link, `result` is the link target, which is relative to
    # the directory containing the link, not to the root of the repo.
    target = os.path.normpath(os.path.join(os.path.dirname(path), result))
    return get_file_from_revision(git_repo, rev, target)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800584
585
def list_dir_from_revision(git_repo, rev, path):
  """Lists names under a directory of given revision.

  This runs 'git ls-tree --name-only <rev>:<path>'.

  Args:
    git_repo: path of git repo
    rev: git commit id
    path: directory path, relative to git root

  Returns:
    list of names

  Raises:
    subprocess.CalledProcessError: if `path` doesn't exists in `rev`
  """
  tree = '%s:%s' % (rev, path)
  output = util.check_output(
      'git', 'ls-tree', '--name-only', tree, cwd=git_repo, log_stdout=False)
  return output.splitlines()
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800607
608
def get_rev_by_time(git_repo, timestamp, branch, path=None):
  """Finds the latest first-parent commit before given time.

  If `path` is not specified, the fast lookup cache is consulted first.
  Otherwise (or if the cache cannot answer), 'git rev-list' is used.

  Args:
    git_repo: path of git repo.
    timestamp: timestamp
    branch: only query parent of the `branch`. If branch=None, it means 'HEAD'
        (current branch, usually).
    path: only query history of path, relative to git_repo

  Returns:
    git commit hash. None if path didn't exist at the given time.
  """
  branch = branch or 'HEAD'

  if not path:
    try:
      return fast_lookup.get_rev_by_time(git_repo, timestamp, branch)
    except FastLookupFailed:
      pass  # Not cached; ask git.

  cmd = ['git', 'rev-list', '--first-parent', '-1']
  cmd.extend(['--before', str(timestamp), branch])
  if path:
    cmd.extend(['--', path])

  output = util.check_output(*cmd, cwd=git_repo).strip()
  if output:
    return output
  return None
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800645
646
def get_revlist_by_period(git_repo, branch, period):
  """Lists commits of a branch in given time period.

  The result starts with the last commit before the period (if any), followed
  by the commits inside the period, in the order printed by
  'git rev-list --reverse'.

  Args:
    git_repo: path of git repo.
    branch: branch name or ref name
    period: (begin timestamp, end timestamp)

  Returns:
    list of (timestamp, commit hash)
  """
  begin, end = period[0], period[1]

  # Find the last commit before period[0].
  last_before = util.check_output(
      'git', 'rev-list', '--timestamp', '-1',
      '--before', str(begin - 1),
      branch,
      cwd=git_repo)

  # Find commits in the period.
  within = util.check_output(
      'git', 'rev-list', '--timestamp', '--reverse',
      '--after', str(begin),
      '--before', str(end),
      branch,
      cwd=git_repo)

  commits = []
  for line in (last_before + within).splitlines():
    timestamp, commit = line.split()
    commits.append((int(timestamp), commit))
  return commits
677
678
def reset_hard(git_repo):
  """Restores modified and deleted files of a git checkout.

  This is simply wrapper of "git reset --hard".

  Args:
    git_repo: path of git repo.
  """
  cmd = ['git', 'reset', '--hard']
  util.check_call(*cmd, cwd=git_repo)
688
689
def list_untracked(git_repo, excludes=None):
  """Lists untracked files and directories.

  This runs 'git ls-files --others --exclude-standard', with one additional
  '--exclude' pattern (anchored by leading '/') for each item of `excludes`.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo

  Returns:
    list of paths, relative to git_repo
  """
  exclude_flags = []
  for exclude in excludes or ():
    assert not os.path.isabs(exclude), 'should be relative'
    exclude_flags.extend(['--exclude', '/' + re.escape(exclude)])

  output = util.check_output(
      'git',
      'ls-files',
      '--others',
      '--exclude-standard',
      *exclude_flags,
      cwd=git_repo)
  # Remove the trailing slash, which means directory.
  return [line.rstrip('/') for line in output.splitlines()]
718
719
def distclean(git_repo, excludes=None):
  """Cleans up git repo directory.

  Restore modified and deleted files. Delete untracked files.

  Args:
    git_repo: path of git repo.
    excludes: files and/or directories to ignore, relative to git_repo
  """
  reset_hard(git_repo)

  # Delete untracked files.
  for untracked in list_untracked(git_repo, excludes=excludes):
    path = os.path.join(git_repo, untracked)
    logger.debug('delete untracked: %s', path)
    # Symbolic links (even those pointing to directories) are removed as
    # files; only real directories are removed recursively.
    if os.path.isdir(path) and not os.path.islink(path):
      shutil.rmtree(path)
    else:
      os.unlink(path)
741
742
def get_history(git_repo,
                path=None,
                branch=None,
                after=None,
                before=None,
                padding_begin=False,
                padding_end=False,
                with_subject=False):
  """Get first-parent commit history of given path.

  `after` and `before` could be outside of lifetime of `path`. `padding_begin`
  and `padding_end` are used to control what to return for such cases.

  Args:
    git_repo: path of git repo.
    path: path to query, relative to git_repo
    branch: branch name or ref name
    after: limit history after given time (inclusive)
    before: limit history before given time (inclusive)
    padding_begin: If True, pads returned result with dummy record at exact
        'after' time, if 'path' existed at that time.
    padding_end: If True, pads returned result with dummy record at exact
        'before' time, if 'path' existed at that time.
    with_subject: If True, return commit subject together

  Returns:
    List of (timestamp, git hash, subject); or (timestamp, git hash) depends
    on with_subject flag. They are all events when `path` was added, removed,
    modified, and start and end time if the corresponding padding flag is
    true. For padded records, the subject (if requested) is 'dummy subject'.

    For each pair, at `timestamp`, the repo state is `git hash`. In other
    words, `timestamp` is not necessary the commit time of `git hash` for the
    padded entries.
  """
  fields_format = '%ct %H %s' if with_subject else '%ct %H'
  cmd = ['git', 'log', '--reverse', '--first-parent']
  cmd.append('--format=' + fields_format)
  if after:
    cmd.extend(['--after', str(after)])
  if before:
    cmd.extend(['--before', str(before)])
  if branch:
    assert not is_git_rev(branch)
    cmd.append(branch)
  if path:
    # '--' is necessary otherwise if `path` is removed in current revision, git
    # will complain it's an ambiguous argument which may be path or something
    # else (like git branch name, tag name, etc.)
    cmd.extend(['--', path])

  result = []
  for line in util.check_output(*cmd, cwd=git_repo).splitlines():
    # fields = [timestamp, git_rev, subject] or [timestamp, git_rev]
    fields = line.split(' ', 2)
    result.append((int(fields[0]),) + tuple(fields[1:]))

  # Extra column of padded records, to keep tuple width consistent.
  subject_pad = ('dummy subject',) if with_subject else ()

  if padding_end:
    assert before, 'padding_end=True make no sense if before=None'
    if get_rev_by_time(git_repo, before, branch, path=path):
      before = int(before)
      if not result or result[-1][0] != before:
        git_rev = get_rev_by_time(git_repo, before, branch)
        assert git_rev
        result.append((before, git_rev) + subject_pad)

  if padding_begin:
    assert after, 'padding_begin=True make no sense if after=None'
    if get_rev_by_time(git_repo, after, branch, path=path):
      after = int(after)
      if not result or result[0][0] != after:
        git_rev = get_rev_by_time(git_repo, after, branch)
        assert git_rev
        result.insert(0, (after, git_rev) + subject_pad)

  return result
Kuang-che Wu3eb6b502018-06-06 16:15:18 +0800827
828
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800829def get_history_recursively(git_repo,
830 path,
831 after,
832 before,
833 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800834 padding_end=True,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800835 branch=None):
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800836 """Get commit history of given path and its dependencies.
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800837
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800838 In comparison to get_history(), get_history_recursively also takes
839 dependencies into consideration. For example, if file A referenced file B,
840 get_history_recursively(A) will return commits of B in addition to A. This
841 applies recursively, so commits of C will be included if file B referenced
842 file C, and so on.
843
844 This function is file type neutral. `parser_callback(filename, content)` will
845 be invoked to parse file content and should return list of filename of
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800846 dependencies. If `parser_callback` returns None (usually syntax error), the
847 commit is omitted.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800848
849 Args:
850 git_repo: path of git repo
851 path: path to query, relative to git_repo
852 after: limit history after given time (inclusive)
853 before: limit history before given time (inclusive)
854 parser_callback: callback to parse file content. See above comment.
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800855 padding_end: If True, pads returned result with dummy record at exact
856 'after' time, if 'path' existed at that time.
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800857 branch: branch name or ref name
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800858
859 Returns:
860 list of (commit timestamp, git hash)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800861 """
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800862 history = get_history(
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800863 git_repo,
864 path,
865 after=after,
866 before=before,
867 padding_begin=True,
868 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800869
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800870 # Collect include information of each commit.
871 includes = {}
872 for commit_time, git_rev in history:
873 content = get_file_from_revision(git_repo, git_rev, path)
Kuang-che Wu7d0c7592019-09-16 09:59:28 +0800874 parse_result = parser_callback(path, content)
875 if parse_result is None:
876 continue
877 for include_name in parse_result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800878 if include_name not in includes:
879 includes[include_name] = set()
880 includes[include_name].add(git_rev)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800881
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800882 # Analyze the start time and end time of each include.
883 dependencies = []
884 for include in includes:
885 appeared = None
886 for commit_time, git_rev in history:
887 if git_rev in includes[include]:
888 if not appeared:
889 appeared = commit_time
890 else:
891 if appeared:
Zheng-Jie Chang4d617a42020-02-15 06:46:00 +0800892 # dependency file exists in time range [appeared, commit_time)
893 dependencies.append((include, appeared, commit_time - 1))
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800894 appeared = None
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800895
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800896 if appeared is not None:
897 dependencies.append((include, appeared, before))
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800898
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800899 # Recursion and merge.
900 result = list(history)
901 for include, appeared, disappeared in dependencies:
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800902 result += get_history_recursively(
903 git_repo,
904 include,
905 appeared,
906 disappeared,
907 parser_callback,
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800908 padding_end=False,
Zheng-Jie Changd968f552020-01-16 13:31:57 +0800909 branch=branch)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800910
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800911 # Sort and padding.
912 result.sort(key=lambda x: x[0])
913 if padding_end:
914 pad = (before,)
915 pad += result[-1][1:]
916 result.append(pad)
917
918 # Dedup.
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800919 result2 = []
Zheng-Jie Chang313eec32020-02-18 16:17:07 +0800920 for x in result:
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800921 if result2 and result2[-1] == x:
922 continue
923 result2.append(x)
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800924
Kuang-che Wue4bae0b2018-07-19 12:10:14 +0800925 return result2
Kuang-che Wubfc4a642018-04-19 11:54:08 +0800926
927
def get_branches(git_repo, all_branches=True, commit=None, remote=False):
  """List the branch refs of a git repository.

  Runs `git branch --format=%(refname)` inside `git_repo` and returns one
  entry per output line, with surrounding whitespace stripped.

  Args:
    git_repo: path of git repo
    all_branches: if True, `-a` is passed to `git branch` so that remote
        branches are listed as well
    commit: if not None, only branches containing this commit are listed
    remote: if True, `--remote` is passed to list only remote tracking
        branches

  Returns:
    list of branch ref names
  """
  cmd = ['git', 'branch', '--format=%(refname)']
  if all_branches:
    cmd.append('-a')
  if commit:
    cmd.extend(['--contains', commit])
  if remote:
    cmd.append('--remote')

  output = util.check_output(*cmd, cwd=git_repo)
  return [ref_line.strip() for ref_line in output.splitlines()]
952
953
def list_commits_between_commits(git_repo, old, new):
  """Enumerate the first-parent commits in the range (old, new].

  `old` must be an ancestor of `new`; this is enforced by assertion.

  Args:
    git_repo: path of git repo.
    old: old commit hash (exclusive)
    new: new commit hash (inclusive)

  Returns:
    list of (timestamp, rev), as listed by `git rev-list --reverse` and then
    passed through the timestamp adjustment. Empty list if old equals new.
  """
  assert old and new
  if old == new:
    return []
  assert is_ancestor_commit(git_repo, old, new)

  # --first-parent is necessary for Android, see following link for more
  # discussion.
  # https://docs.google.com/document/d/1c8qiq14_ObRRjLT62sk9r5V5cyCGHX66dLYab4MVnks/edit#heading=h.n3i6mt2n6xuu
  rev_range = '%s..%s' % (old, new)
  output = util.check_output(
      'git',
      'rev-list',
      '--timestamp',
      '--reverse',
      '--first-parent',
      rev_range,
      cwd=git_repo)

  # Each output line is "<timestamp> <rev>".
  commits = [(int(timestamp), git_rev)
             for timestamp, git_rev in (row.split()
                                        for row in output.splitlines())]

  # bisect-kit fundamentally assumes that commit timestamps are increasing,
  # because commits are sorted and bisected by timestamp across git repos.
  # When they are not, the timestamps are adjusted as a workaround. This
  # might lead to a bad bisect result, but the probability is low in practice
  # since most machines' clocks are good enough.
  num_adjusted, commits = _adjust_timestamp_increasingly(commits)
  if num_adjusted != 0:
    logger.warning('Commit timestamps are not increasing')
    logger.warning('%d timestamps adjusted', num_adjusted)

  return commits