| # -*- coding: utf-8 -*- |
| # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Model of source code organization and changes. |
| |
This module models complex source code organization, i.e. nested git repos,
and their version relationships, i.e. pinned or floating git repos. In other
words, it is an abstraction of chrome's gclient DEPS, and chromeos and
Android's repo manifest.
| """ |
| |
| from __future__ import print_function |
| import collections |
| import copy |
| import json |
| import logging |
| import os |
| import re |
| import shutil |
| |
| from bisect_kit import cli |
| from bisect_kit import errors |
| from bisect_kit import git_util |
| |
| logger = logging.getLogger(__name__) |
| |
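# Matches intra-rev version strings like '9900.0.0~9901.0.0/123'; see
# make_intra_rev() below.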
| _re_intra_rev = r'^([^,]+)~([^,]+)/(\d+)$' |
| |
| SPEC_FIXED = 'fixed' |
| SPEC_FLOAT = 'float' |
| _DIFF_CACHE_DIR = 'bisectkit-cache' |
| |
| |
| def make_intra_rev(a, b, index): |
| """Makes intra-rev version string. |
| |
  Between two major "named" versions a and b, there are many small changes
  (commits) in between. bisect-kit identifies all those intermediate states
  and bisects them. We give names to those states and call them "intra-rev",
  which stands for minor version numbers within two major versions.

  Note that a+index (without b) is not enough to identify a unique change due
  to branches. Take chromeos as an example: both 9900.1.0 and 9901.0.0 are
  derived from 9900.0.0, so "9900.0.0 plus 100 changes" may ambiguously refer
  to states in either 9900.1.0 or 9901.0.0.
| |
| Args: |
| a: the start version |
| b: the end version |
| index: the index number of changes between a and b |
| |
| Returns: |
| the intra-rev version string |
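
  Example:
    >>> make_intra_rev('9900.0.0', '9901.0.0', 3)
    '9900.0.0~9901.0.0/3'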
| """ |
| return '%s~%s/%d' % (a, b, index) |
| |
| |
| def parse_intra_rev(rev): |
| """Decomposes intra-rev string. |
| |
  See make_intra_rev() for what an intra-rev is.
| |
| Args: |
| rev: intra-rev string or normal version number |
| |
| Returns: |
    (start, end, index). If rev is not an intra-rev, it must be a normal
    version number and (rev, rev, 0) is returned.
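
  Example:
    >>> parse_intra_rev('9900.0.0~9901.0.0/3')
    ('9900.0.0', '9901.0.0', 3)
    >>> parse_intra_rev('9900.0.0')
    ('9900.0.0', '9900.0.0', 0)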
| """ |
| m = re.match(_re_intra_rev, rev) |
| if not m: |
| return rev, rev, 0 |
| |
| return m.group(1), m.group(2), int(m.group(3)) |
| |
| |
| def argtype_intra_rev(argtype): |
| """Validates argument is intra-rev. |
| |
| Args: |
| argtype: argtype function which validates major version number |
| |
| Returns: |
| A new argtype function which matches intra-rev |
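
  Example (illustrative; `argtype_version` stands for any argtype function
  which validates major version numbers):
    argtype = argtype_intra_rev(argtype_version)
    argtype('9900.0.0')              # accepted as a major version
    argtype('9900.0.0~9901.0.0/10')  # accepted as an intra-rev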
| """ |
| |
| def argtype_function(s): |
| examples = [] |
| try: |
| return argtype(s) |
| except cli.ArgTypeError as e: |
| examples += e.example |
| |
| m = re.match(_re_intra_rev, s) |
| if m: |
| try: |
| argtype(m.group(1)) |
| argtype(m.group(2)) |
| return s |
| except cli.ArgTypeError as e: |
| for example in e.example: |
| examples.append(make_intra_rev(example, example, 10)) |
| raise cli.ArgTypeError('Invalid intra rev', examples) |
| |
| examples.append(make_intra_rev('<rev1>', '<rev2>', 10)) |
| raise cli.ArgTypeError('Invalid rev', examples) |
| |
| return argtype_function |
| |
| |
def _normalize_repo_url(repo_url):
  """Normalizes a repo url for comparison.

  Strips the authenticated "/a/" path prefix and the trailing ".git" suffix so
  that equivalent repo urls compare equal.
  """
  repo_url = re.sub(r'https://chrome-internal.googlesource.com/a/',
                    r'https://chrome-internal.googlesource.com/', repo_url)
  repo_url = re.sub(r'\.git$', '', repo_url)
  return repo_url
| |
| |
| class PathSpec: |
| """Specified code version of one path. |
| |
| Attributes: |
| path: local path, relative to project base dir |
| repo_url: code repository location |
| at: code version, could be git hash or branch name |
| """ |
| |
| def __init__(self, path, repo_url, at): |
| self.path = path |
| self.repo_url = repo_url |
| self.at = at |
| |
| def is_static(self): |
| return git_util.is_git_rev(self.at) |
| |
| def __eq__(self, rhs): |
| if self.path != rhs.path: |
| return False |
| if self.at != rhs.at: |
| return False |
| if _normalize_repo_url(self.repo_url) != _normalize_repo_url(rhs.repo_url): |
| return False |
| return True |
| |
| def __ne__(self, rhs): |
| return not self == rhs |
| |
| |
| class Spec: |
| """Collection of PathSpec. |
| |
  A Spec is analogous to gclient's DEPS and repo's manifest.
| |
| Attributes: |
| spec_type: type of spec, SPEC_FIXED or SPEC_FLOAT. SPEC_FIXED means code |
| version is pinned and fixed. On the other hand, SPEC_FLOAT is not |
| pinned and the actual version (git commit) may change over time. |
    name: name of this spec, for debugging purposes; usually a version number
        or git hash
| timestamp: timestamp of this spec |
| path: path of spec |
    entries: dict which maps paths to PathSpec objects
    revision: a commit id of manifest-internal indicating the manifest
        revision; this argument is not used for DEPS.
| """ |
| |
| def __init__(self, |
| spec_type, |
| name, |
| timestamp, |
| path, |
| entries=None, |
| revision=None): |
| self.spec_type = spec_type |
| self.name = name |
| self.timestamp = timestamp |
| self.path = path |
| self.entries = entries |
| self.revision = revision |
| |
| def copy(self): |
| return copy.deepcopy(self) |
| |
| def similar_score(self, rhs): |
| """Calculates similar score to another Spec. |
| |
| Returns: |
| score of similarity. Smaller value is more similar. |
| """ |
| score = 0 |
| for path in set(self.entries) & set(rhs.entries): |
| if rhs[path] == self[path]: |
| continue |
| if rhs[path].at == self[path].at: |
        # the remote repo often moved around but should be treated as the
        # same one
| score += 0.1 |
| else: |
| score += 1 |
| score += len(set(self.entries) ^ set(rhs.entries)) |
| return score |
| |
| def is_static(self): |
| return all(path_spec.is_static() for path_spec in self.entries.values()) |
| |
| def is_subset(self, rhs): |
| return set(self.entries.keys()) <= set(rhs.entries.keys()) |
| |
| def __getitem__(self, path): |
| return self.entries[path] |
| |
| def __contains__(self, path): |
| return path in self.entries |
| |
| def apply(self, action_group): |
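    """Applies an ActionGroup to this Spec in place."""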
| self.timestamp = action_group.timestamp |
| self.name = '(%s)' % self.timestamp |
| for action in action_group.actions: |
| if isinstance(action, GitAddRepo): |
| self.entries[action.path] = PathSpec(action.path, action.repo_url, |
| action.rev) |
| elif isinstance(action, GitCheckoutCommit): |
| self.entries[action.path].at = action.rev |
| elif isinstance(action, GitRemoveRepo): |
| del self.entries[action.path] |
| else: |
| assert 0, 'unknown action: %s' % action.__class__.__name__ |
| |
| def dump(self): |
| # for debugging |
| print(self.name, self.path, self.timestamp) |
| print('size', len(self.entries)) |
| for path, path_spec in sorted(self.entries.items()): |
| print(path, path_spec.at) |
| |
| def diff(self, rhs): |
| logger.info('diff between %s and %s', self.name, rhs.name) |
| expect = set(self.entries) |
| actual = set(rhs.entries) |
| common_count = 0 |
| for path in sorted(expect - actual): |
| logger.info('-%s', path) |
| for path in sorted(actual - expect): |
| logger.info('+%s', path) |
| for path in sorted(expect & actual): |
| if self[path] == rhs[path]: |
| common_count += 1 |
| continue |
| if self[path].at != rhs[path].at: |
| logger.info(' %s: at %s vs %s', path, self[path].at, rhs[path].at) |
| if self[path].repo_url != rhs[path].repo_url: |
| logger.info(' %s: repo_url %s vs %s', path, self[path].repo_url, |
| rhs[path].repo_url) |
| logger.info('and common=%s', common_count) |
| |
| |
| class Action: |
| """Actions describe changes from one Spec to another. |
| |
| Attributes: |
| timestamp: action time |
| path: action path, which is relative to project root |
| """ |
| |
| def __init__(self, timestamp, path): |
| self.timestamp = timestamp |
| self.path = path |
| |
| def apply(self, _code_storage, _root_dir): |
| raise NotImplementedError |
| |
| def summary(self): |
| raise NotImplementedError |
| |
| def __eq__(self, rhs): |
| return self.__dict__ == rhs.__dict__ |
| |
| def serialize(self): |
| return self.__class__.__name__, self.__dict__ |
| |
| |
| def unserialize_action(data): |
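  """Reconstructs an Action object from its serialized form.

  Args:
    data: a (class_name, attributes_dict) pair, as produced by
        Action.serialize()

  Returns:
    the reconstructed Action object
  """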
| classes = [GitCheckoutCommit, GitAddRepo, GitRemoveRepo] |
| class_name, values = data |
| assert class_name in [cls.__name__ for cls in classes |
| ], 'unknown action class: %s' % class_name |
| for cls in classes: |
| if class_name == cls.__name__: |
| action = cls(**values) |
| break |
| return action |
| |
| |
| class ActionGroup: |
| """Atomic group of Action objects |
| |
| This models atomic actions, ex: |
| - repo added/removed in the same manifest commit |
| - commits appears at the same time due to repo add |
| - gerrit topic |
| - circular CQ-DEPEND (Cq-Depend) |
| Otherwise, one ActionGroup usually consists only one Action object. |
| """ |
| |
| def __init__(self, timestamp, comment=None): |
| self.timestamp = timestamp |
| self.name = None |
| self.actions = [] |
| self.comment = comment |
| |
| def add(self, action): |
| self.actions.append(action) |
| |
| def serialize(self): |
| return dict( |
| timestamp=self.timestamp, |
| name=self.name, |
| comment=self.comment, |
| actions=[a.serialize() for a in self.actions]) |
| |
| def summary(self): |
| result = {} |
| if self.comment: |
| result['comment'] = self.comment |
| result['actions'] = [action.summary() for action in self.actions] |
| return result |
| |
| @staticmethod |
| def unserialize(data): |
| ag = ActionGroup(data['timestamp']) |
| ag.name = data['name'] |
| ag.comment = data['comment'] |
| for x in data['actions']: |
| ag.add(unserialize_action(x)) |
| return ag |
| |
| def apply(self, code_storage, root_dir): |
| for action in self.actions: |
| action.apply(code_storage, root_dir) |
| |
| |
| class GitCheckoutCommit(Action): |
| """Describes a git commit action. |
| |
| Attributes: |
| repo_url: the corresponding url of git repo |
| rev: git commit to checkout |
| """ |
| |
| def __init__(self, timestamp, path, repo_url, rev): |
| super().__init__(timestamp, path) |
| self.repo_url = repo_url |
| self.rev = rev |
| |
| def apply(self, code_storage, root_dir): |
| del code_storage # unused |
| git_repo = os.path.join(root_dir, self.path) |
| assert git_util.is_git_root(git_repo) |
| git_util.checkout_version(git_repo, self.rev) |
| |
| def summary(self): |
| text = 'commit %s %s' % (self.rev[:10], self.path) |
| return dict( |
| timestamp=self.timestamp, |
| action_type='commit', |
| path=self.path, |
| repo_url=self.repo_url, |
| rev=self.rev, |
| text=text, |
| ) |
| |
| |
| class GitAddRepo(Action): |
| """Describes a git repo add action. |
| |
| Attributes: |
| repo_url: the corresponding url of git repo to add |
| rev: git commit to checkout |
| """ |
| |
| def __init__(self, timestamp, path, repo_url, rev): |
| super().__init__(timestamp, path) |
| self.repo_url = repo_url |
| self.rev = rev |
| |
| def apply(self, code_storage, root_dir): |
| git_repo = os.path.join(root_dir, self.path) |
| if os.path.exists(git_repo): |
| if os.path.isdir(git_repo) and not os.listdir(git_repo): |
| # mimic gclient's behavior; don't panic |
| logger.warning( |
| 'adding repo %s; there is already an empty directory; ' |
| 'assume it is okay', git_repo) |
| else: |
| assert not os.path.exists(git_repo), '%s already exists' % git_repo |
| |
| reference = code_storage.cached_git_root(self.repo_url) |
| git_util.clone(git_repo, self.repo_url, reference=reference) |
| git_util.checkout_version(git_repo, self.rev) |
| |
| code_storage.add_to_project_list(root_dir, self.path, self.repo_url) |
| |
| def summary(self): |
| text = 'add repo %s from %s@%s' % (self.path, self.repo_url, self.rev[:10]) |
| return dict( |
| timestamp=self.timestamp, |
| action_type='add_repo', |
| path=self.path, |
| repo_url=self.repo_url, |
| rev=self.rev, |
| text=text, |
| ) |
| |
| |
| class GitRemoveRepo(Action): |
| """Describes a git repo remove action.""" |
| |
| def apply(self, code_storage, root_dir): |
| assert self.path |
| root_dir = os.path.normpath(root_dir) |
| git_repo = os.path.join(root_dir, self.path) |
| assert git_util.is_git_root(git_repo), '%r should be a git repo' % git_repo |
| # TODO(kcwu): other projects may be sub-tree of `git_repo`. |
| # They should not be deleted. (crbug/930047) |
| shutil.rmtree(git_repo) |
| |
| # Remove empty parents. (But don't delete `root_dir` and its upper parents.) |
| parent = os.path.dirname(git_repo) |
| while (parent != root_dir and |
| os.path.commonpath([parent, root_dir]) == root_dir): |
| if os.listdir(parent): |
| break |
| os.rmdir(parent) |
| parent = os.path.dirname(parent) |
| |
| code_storage.remove_from_project_list(root_dir, self.path) |
| |
| def summary(self): |
| return dict( |
| timestamp=self.timestamp, |
| action_type='remove_repo', |
| path=self.path, |
| text='remove repo %s' % self.path, |
| ) |
| |
| |
| def apply_actions(code_storage, action_groups, root_dir): |
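  """Applies a list of ActionGroup objects onto a checked-out source tree.

  Args:
    code_storage: CodeStorage object
    action_groups: list of ActionGroup objects, in order
    root_dir: root path of the project source tree
  """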
  # Speed optimization: only apply the last one of consecutive commits per
  # repo. It is possible to optimize further, but we would need to take care
  # of git repos added/removed within another repo.
| commits = {} |
| |
| def batch_apply(commits): |
| for i, _, commit_action in sorted(commits.values(), key=lambda x: x[:2]): |
| logger.debug('[%d] applying "%r"', i, commit_action.summary()) |
| commit_action.apply(code_storage, root_dir) |
| |
| for i, action_group in enumerate(action_groups, 1): |
| for action in action_group.actions: |
| if not isinstance(action, GitCheckoutCommit): |
| break |
| else: |
| # If all actions are commits, defer them for batch processing. |
| for j, action in enumerate(action_group.actions): |
| commits[action.path] = (i, j, action) |
| continue |
| |
| batch_apply(commits) |
| commits = {} |
| logger.debug('[%d] applying "%r"', i, action_group.summary()) |
| action_group.apply(code_storage, root_dir) |
| |
| batch_apply(commits) |
| |
| |
| class SpecManager: |
| """Spec related abstract operations. |
| |
  This class enumerates Spec instances and switches the disk state to match
  a given Spec.
| |
| In other words, this class abstracts: |
| - discovery of gclient's DEPS and repo's manifest |
| - gclient sync and repo sync |
| """ |
| |
| def collect_float_spec(self, old, new, fixed_specs=None): |
| """Collects float Spec between two versions. |
| |
    This method may fetch specs from the network. However, it should not
    switch the tree's version state.
| |
| Args: |
| old: old version |
| new: new version |
| fixed_specs: fixed specs from collect_fixed_spec(old, new) for Chrome OS |
| or None for others |
| """ |
| raise NotImplementedError |
| |
| def collect_fixed_spec(self, old, new): |
| """Collects fixed Spec between two versions. |
| |
    This method may fetch specs from the network. However, it should not
    switch the tree's version state.
| """ |
| raise NotImplementedError |
| |
| def parse_spec(self, spec): |
| """Parses information for Spec object. |
| |
| Args: |
| spec: Spec object. It specifies what to parse and the parsed information |
| is stored inside. |
| """ |
| raise NotImplementedError |
| |
| def sync_disk_state(self, rev): |
| """Switch source tree state to given version.""" |
| raise NotImplementedError |
| |
| |
| class CodeStorage: |
| """Query code history and commit relationship without checkout. |
| |
| Because paths inside source tree may be deleted or map to different remote |
| repo in different versions, we cannot query git information of one version |
| but the tree state is at another version. In order to query information |
| without changing tree state and fast, we need out of tree source code |
| storage. |
| |
| This class assumes all git repos are mirrored somewhere on local disk. |
| Subclasses just need to implement cached_git_root() which returns the |
| location. |
| |
  In other words, this class abstracts operations upon gclient's cache-dir
  and repo's mirror.
| """ |
| |
| def cached_git_root(self, repo_url): |
| """The cached path of given remote git repo. |
| |
| Args: |
| repo_url: URL of git remote repo |
| |
| Returns: |
| path of cache folder |
| """ |
| raise NotImplementedError |
| |
| def add_to_project_list(self, project_root, path, repo_url): |
| raise NotImplementedError |
| |
| def remove_from_project_list(self, project_root, path): |
| raise NotImplementedError |
| |
| def is_ancestor_commit(self, spec, path, old, new): |
| """Determine one commit is ancestor of another. |
| |
| Args: |
| spec: Spec object |
| path: local path relative to project root |
| old: commit id |
| new: commit id |
| |
| Returns: |
      True if `old` is an ancestor of `new`
| """ |
| git_root = self.cached_git_root(spec[path].repo_url) |
| return git_util.is_ancestor_commit(git_root, old, new) |
| |
| def get_rev_by_time(self, spec, path, timestamp): |
| """Get commit hash of given spec by time. |
| |
| Args: |
| spec: Spec object |
| path: local path relative to project root |
| timestamp: timestamp |
| |
| Returns: |
      The commit hash at the given time. If there are multiple commits with
      the given timestamp, returns the last one.
| """ |
| git_root = self.cached_git_root(spec[path].repo_url) |
| # spec[path].at is remote reference name. Since git_root is a mirror (not |
| # a local checkout), there is no need to convert the name. |
| return git_util.get_rev_by_time(git_root, timestamp, spec[path].at) |
| |
| def get_actions_between_two_commit(self, |
| spec, |
| path, |
| old, |
| new, |
| ignore_not_ancestor=False): |
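    """Enumerates commits between two commits of one path as checkout actions.

    Args:
      spec: Spec object
      path: local path relative to project root
      old: old commit id
      new: new commit id
      ignore_not_ancestor: if True and `old` is not an ancestor of `new`,
          treat the whole range as a single checkout of `new`

    Returns:
      list of GitCheckoutCommit actions
    """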
| git_root = self.cached_git_root(spec[path].repo_url) |
| result = [] |
    # If not on the same branch, regard the change as one atomic operation.
    # This situation happens when
    # 1. new is branched from old, and
    # 2. commit timestamps are not reliable (i.e. commit time != merge time);
    # then old and new might not have an ancestor relationship.
| if ignore_not_ancestor and old != new and not git_util.is_ancestor_commit( |
| git_root, old, new): |
| timestamp = git_util.get_commit_time(git_root, new) |
| result.append( |
| GitCheckoutCommit(timestamp, path, spec[path].repo_url, new)) |
| return result |
| |
| for timestamp, git_rev in git_util.list_commits_between_commits( |
| git_root, old, new): |
| result.append( |
| GitCheckoutCommit(timestamp, path, spec[path].repo_url, git_rev)) |
| return result |
| |
| def is_containing_commit(self, spec, path, rev): |
| git_root = self.cached_git_root(spec[path].repo_url) |
| return git_util.is_containing_commit(git_root, rev) |
| |
| def are_spec_commits_available(self, spec): |
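    """Returns True if all pinned commits in spec exist in local cache."""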
| for path, path_spec in spec.entries.items(): |
| if not path_spec.is_static(): |
| continue |
| if not self.is_containing_commit(spec, path, path_spec.at): |
| return False |
| return True |
| |
| |
| class CodeManager: |
| """Class to reconstruct historical source tree state. |
| |
| This class can reconstruct all moments of source tree state and diffs between |
| them. |
| |
| Attributes: |
| root_dir: root path of project source tree |
| spec_manager: SpecManager object |
| code_storage: CodeStorage object |
| """ |
| |
| def __init__(self, root_dir, spec_manager, code_storage): |
| self.root_dir = root_dir |
| self.spec_manager = spec_manager |
| self.code_storage = code_storage |
| |
| def generate_action_groups_between_specs(self, prev_float, next_float): |
| """Generates actions between two float specs. |
| |
| Args: |
      prev_float: the starting float Spec (exclusive)
      next_float: the ending float Spec (inclusive)

    Returns:
      list of ActionGroup objects (ordered)
| """ |
| groups = [] |
| last_group = ActionGroup(next_float.timestamp) |
| is_removed = set() |
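    # `last_group` collects actions which take effect atomically at
    # next_float.timestamp (repo adds/removes and final checkouts); other
    # per-commit actions each get their own group, sorted by time at the end.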
| |
    # `branch_between_float_specs` is currently chromeos-only logic;
    # branch behavior has not been verified for android and chrome yet.
| is_chromeos_branched = False |
| if hasattr(self.spec_manager, 'branch_between_float_specs' |
| ) and self.spec_manager.branch_between_float_specs( |
| prev_float, next_float): |
| is_chromeos_branched = True |
| |
    # Sort alphabetically, so parent directories are handled before child
    # directories.
| for path in sorted(set(prev_float.entries) | set(next_float.entries)): |
| # Add repo |
| if path not in prev_float: |
| if next_float[path].is_static(): |
| next_at = next_float[path].at |
| else: |
| next_at = self.code_storage.get_rev_by_time(next_float, path, |
| next_float.timestamp) |
| last_group.add( |
| GitAddRepo(next_float.timestamp, path, next_float[path].repo_url, |
| next_at)) |
| continue |
| |
| # Existing path is floating. |
| if not prev_float[path].is_static(): |
        # Enumerate commits until the next spec. Get `prev_at` and `till_at`
        # from prev_float's and next_float's timestamps.
| # |
| # 1. Non-branched case: |
| # |
| # prev_at till_at |
| # prev branch ---> o --------> o --------> o --------> o --------> ... |
| # ^ ^ |
| # prev_float.timestamp next_float.timestamp |
| # |
| # building an image between prev_at and till_at should follow |
| # prev_float's spec. |
| # |
| # 2. Branched case: |
| # |
| # till_at |
| # /------->o----------> |
| # / ^ next_float.timestamp |
| # / prev_at |
| # ---------->o----------------------> |
| # ^prev_float.timestamp |
| # |
| # building an image between prev_at and till_at should follow |
| # next_float's spec. |
| # |
| prev_at = self.code_storage.get_rev_by_time(prev_float, path, |
| prev_float.timestamp) |
| if is_chromeos_branched: |
| till_at = self.code_storage.get_rev_by_time(next_float, path, |
| next_float.timestamp) |
| else: |
| till_at = self.code_storage.get_rev_by_time(prev_float, path, |
| next_float.timestamp) |
| actions = self.code_storage.get_actions_between_two_commit( |
| prev_float, path, prev_at, till_at, ignore_not_ancestor=True) |
| |
| # Assume commits with the same timestamp as manifest/DEPS change are |
| # atomic. |
| if actions and actions[-1].timestamp == next_float.timestamp: |
| last_group.add(actions.pop()) |
| |
| for action in actions: |
| group = ActionGroup(action.timestamp) |
| group.add(action) |
| groups.append(group) |
| else: |
| prev_at = till_at = prev_float[path].at |
| |
| # At next_float.timestamp. |
| if path not in next_float: |
| if path in is_removed: |
| continue |
| # remove repo |
| next_at = None |
| sub_repos = [p for p in prev_float.entries if p.startswith(path + '/')] |
| # Remove deeper repo first |
| for path2 in sorted(sub_repos, reverse=True): |
| last_group.add(GitRemoveRepo(next_float.timestamp, path2)) |
| is_removed.add(path2) |
| last_group.add(GitRemoveRepo(next_float.timestamp, path)) |
| is_removed.add(path) |
| for path2 in sorted(set(sub_repos) & set(next_float.entries)): |
| last_group.add( |
| GitAddRepo(next_float.timestamp, path2, |
| next_float[path2].repo_url, prev_float[path2].at)) |
| |
| elif next_float[path].is_static(): |
        # pinned to a certain commit, possibly on a different branch
| next_at = next_float[path].at |
| |
| elif next_float[path].at == prev_float[path].at: |
| # keep floating on the same branch |
| next_at = till_at |
| |
| else: |
| # switch to another branch |
| # prev_at till_at |
| # prev branch ---> o --------> o --------> o --------> o --------> ... |
| # |
| # next_at |
| # next branch ...... o ------> o --------> o -----> ... |
| # ^ ^ |
| # prev_float.timestamp next_float.timestamp |
| next_at = self.code_storage.get_rev_by_time(next_float, path, |
| next_float.timestamp) |
| |
| if next_at and next_at != till_at: |
| last_group.add( |
| GitCheckoutCommit(next_float.timestamp, path, |
| next_float[path].repo_url, next_at)) |
| |
| groups.sort(key=lambda x: x.timestamp) |
| if last_group.actions: |
| groups.append(last_group) |
| return groups |
| |
| def synthesize_fixed_spec(self, float_spec, timestamp): |
| """Synthesizes fixed spec from float spec of given time. |
| |
| Args: |
| float_spec: the float spec |
| timestamp: snapshot time |
| |
| Returns: |
| Spec object |
| """ |
| result = {} |
| for path, path_spec in float_spec.entries.items(): |
| if not path_spec.is_static(): |
| at = self.code_storage.get_rev_by_time(float_spec, path, timestamp) |
| path_spec = PathSpec(path_spec.path, path_spec.repo_url, at) |
| |
| result[path] = copy.deepcopy(path_spec) |
| |
| name = '%s@%s' % (float_spec.path, timestamp) |
| return Spec(SPEC_FIXED, name, timestamp, float_spec.path, result) |
| |
| def match_spec(self, target, specs, start_index=0): |
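    """Matches a fixed spec against a list of synthesized specs.

    Args:
      target: the fixed Spec object to match
      specs: list of synthesized Spec objects
      start_index: index in `specs` to start searching from

    Returns:
      the index of the best matching spec, or None if no match is found
    """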
| threshold = 3600 |
| # ideal_index is the index of last spec before target |
| # begin and end are the range of indexes within threshold (inclusive) |
| ideal_index = None |
| begin, end = None, None |
| for i, spec in enumerate(specs[start_index:], start_index): |
| if spec.timestamp <= target.timestamp: |
| ideal_index = i |
| if abs(spec.timestamp - target.timestamp) < threshold: |
| if begin is None: |
| begin = i |
| end = i |
| |
| candidates = [] |
| if ideal_index is not None: |
| candidates.append(ideal_index) |
| if begin is not None: |
| candidates.extend(list(range(begin, end + 1))) |
| if not candidates: |
| logger.error('unable to match %s: all specs are after it', target.name) |
| return None |
| |
| compatible_candidates = [ |
| i for i in candidates if specs[i].is_subset(target) |
| ] |
| if not compatible_candidates: |
| logger.error('unable to match %s: no compatible specs', target.name) |
| spec = specs[candidates[0]] |
| target.diff(spec) |
| return None |
| |
| scores = [] |
| for i in compatible_candidates: |
| # Tie-break: prefer earlier timestamp and smaller difference. |
| if specs[i].timestamp <= target.timestamp: |
| timediff = 0, target.timestamp - specs[i].timestamp |
| else: |
| timediff = 1, specs[i].timestamp - target.timestamp |
| scores.append((specs[i].similar_score(target), timediff, i)) |
| scores.sort() |
| |
| score, _, index = scores[0] |
| if score != 0: |
      logger.warning('not an exact match (score=%s): %s', score, target.name)
| target.diff(specs[index]) |
| |
    if ideal_index is not None and index < ideal_index:
| logger.warning( |
| '%s (%s) matched earlier spec at %s instead of %s, racing? offset %d', |
| target.name, target.timestamp, specs[index].timestamp, |
| specs[ideal_index].timestamp, |
| specs[index].timestamp - target.timestamp) |
    if ideal_index is not None and index > ideal_index:
| logger.warning( |
| 'spec committed at %d matched later commit at %d. bad server clock?', |
| target.timestamp, specs[index].timestamp) |
| |
| return index |
| |
| def associate_fixed_and_synthesized_specs(self, fixed_specs, |
| synthesized_specs): |
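    """Associates fixed specs with their corresponding synthesized specs.

    Returns:
      list of (fixed_spec_index, synthesized_spec_index) pairs
    """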
    # All fixed specs are snapshots of float specs. Theoretically, each of
    # them should be identical to one of the synthesized specs.
    # However, this is not always true --- maybe due to race conditions,
    # maybe due to bugs in bisect-kit.
    # To overcome such glitches, we match them by similarity instead of
    # exact match.
| result = [] |
| last_index = 0 |
| for i, fixed_spec in enumerate(fixed_specs): |
| matched_index = self.match_spec(fixed_spec, synthesized_specs, last_index) |
| if matched_index is None: |
| if i in (0, len(fixed_specs) - 1): |
| logger.error('essential spec mismatch, unable to continue') |
          raise ValueError('Commit history analysis failed. '
                           'Bisector cannot deal with this version range.')
        logger.warning('%s does not match, skipping', fixed_spec.name)
| continue |
| result.append((i, matched_index)) |
| last_index = matched_index |
| |
| return result |
| |
| def _create_make_up_actions(self, fixed_spec, synthesized): |
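    """Creates an ActionGroup to make up the glitch between matched specs.

    Returns:
      an ActionGroup, or None if the two specs already agree
    """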
| timestamp = synthesized.timestamp |
| make_up = ActionGroup( |
| timestamp, comment='make up glitch for %s' % fixed_spec.name) |
| for path in set(fixed_spec.entries) & set(synthesized.entries): |
| if fixed_spec[path].at == synthesized[path].at: |
| continue |
| action = GitCheckoutCommit(timestamp, path, synthesized[path].repo_url, |
| synthesized[path].at) |
| make_up.add(action) |
| |
| if not make_up.actions: |
| return None |
| return make_up |
| |
| def _batch_fill_action_commit_log(self, details): |
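    """Fills 'commit_summary' for commit actions in details, per repo."""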
| group_by_repo = collections.defaultdict(list) |
| for detail in details.values(): |
| for action in detail.get('actions', []): |
| if action['action_type'] == 'commit': |
| group_by_repo[action['repo_url']].append(action) |
| |
| for repo_url, actions in group_by_repo.items(): |
| git_root = self.code_storage.cached_git_root(repo_url) |
| revs = set(a['rev'] for a in actions) |
| metas = git_util.get_batch_commit_metadata(git_root, revs) |
| for action in actions: |
| meta = metas[action['rev']] |
| if meta is None: |
| commit_summary = '(unknown)' |
| else: |
| commit_summary = meta['message'].splitlines()[0] |
| action['commit_summary'] = commit_summary |
| |
| def build_revlist(self, old, new): |
| """Build revlist. |
| |
| Returns: |
| (revlist, details): |
| revlist: list of rev string |
| details: dict of rev to rev detail |
| """ |
| logger.info('build_revlist: old=%s, new=%s', old, new) |
| revlist = [] |
| details = {} |
| |
| # Enable cache for repetitive git operations. The space complexity is |
| # O(number of candidates). |
| git_util.get_commit_metadata.enable_cache() |
| git_util.get_file_from_revision.enable_cache() |
| git_util.is_containing_commit.enable_cache() |
| git_util.is_ancestor_commit.enable_cache() |
| |
| # step 1, find all float and fixed specs in the given range. |
| fixed_specs = self.spec_manager.collect_fixed_spec(old, new) |
| assert fixed_specs |
| for spec in fixed_specs: |
| self.spec_manager.parse_spec(spec) |
| |
| float_specs = self.spec_manager.collect_float_spec(old, new, fixed_specs) |
| assert float_specs |
| while float_specs[-1].timestamp > fixed_specs[-1].timestamp: |
| float_specs.pop() |
| assert float_specs |
| for spec in float_specs: |
| self.spec_manager.parse_spec(spec) |
| |
| git_util.fast_lookup.optimize( |
| (float_specs[0].timestamp, float_specs[-1].timestamp)) |
| # step 2, synthesize all fixed specs in the range from float specs. |
| specs = float_specs + [fixed_specs[-1]] |
| action_groups = [] |
| logger.debug('len(specs)=%d', len(specs)) |
| for i in range(len(specs) - 1): |
| prev_float = specs[i] |
| next_float = specs[i + 1] |
| logger.debug('[%d], between %s (%s) and %s (%s)', i, prev_float.name, |
| prev_float.timestamp, next_float.name, next_float.timestamp) |
| action_groups += self.generate_action_groups_between_specs( |
| prev_float, next_float) |
| |
| spec = self.synthesize_fixed_spec(float_specs[0], fixed_specs[0].timestamp) |
| synthesized = [spec.copy()] |
| for action_group in action_groups: |
| spec.apply(action_group) |
| synthesized.append(spec.copy()) |
| |
| # step 3, associate fixed specs with synthesized specs. |
| associated_pairs = self.associate_fixed_and_synthesized_specs( |
| fixed_specs, synthesized) |
| |
| # step 4, group actions and cache them |
| for i, (fixed_index, synthesized_index) in enumerate(associated_pairs[:-1]): |
| next_fixed_index, next_synthesized_index = associated_pairs[i + 1] |
| revlist.append(fixed_specs[fixed_index].name) |
| this_action_groups = [] |
| |
| # handle glitch |
| if fixed_specs[fixed_index].similar_score( |
| synthesized[synthesized_index]) != 0: |
| assert synthesized[synthesized_index].is_subset( |
| fixed_specs[fixed_index]) |
| skipped = set(fixed_specs[fixed_index].entries) - set( |
| synthesized[synthesized_index].entries) |
| if skipped: |
| logger.warning( |
| 'between %s and %s, ' |
| 'bisect-kit cannot analyze commit history of following paths:', |
| fixed_specs[fixed_index].name, fixed_specs[next_fixed_index].name) |
| for path in sorted(skipped): |
| logger.warning(' %s', path) |
| |
| make_up = self._create_make_up_actions(fixed_specs[fixed_index], |
| synthesized[synthesized_index]) |
| if make_up: |
| this_action_groups.append(make_up) |
| |
| this_action_groups.extend( |
| action_groups[synthesized_index:next_synthesized_index]) |
| for idx, ag in enumerate(this_action_groups, 1): |
| rev = make_intra_rev(fixed_specs[fixed_index].name, |
| fixed_specs[next_fixed_index].name, idx) |
| ag.name = rev |
| revlist.append(rev) |
| details[rev] = ag.summary() |
| |
| self.save_action_groups_between_releases( |
| fixed_specs[fixed_index].name, fixed_specs[next_fixed_index].name, |
| this_action_groups) |
| revlist.append(fixed_specs[associated_pairs[-1][0]].name) |
| |
| self._batch_fill_action_commit_log(details) |
| |
| # Verify all repos in between are cached. |
| for spec in reversed(float_specs): |
| if self.code_storage.are_spec_commits_available(spec): |
| continue |
| raise errors.InternalError('Some commits in %s (%s) are unavailable' % |
| (spec.name, spec.path)) |
| |
    # Disable caches because there might be write or even destructive git
    # operations when switching git versions. Be conservative for now. We can
    # cache more if we observe more slow git operations later.
| git_util.fast_lookup.disable() |
| git_util.get_commit_metadata.disable_cache() |
| git_util.get_file_from_revision.disable_cache() |
| git_util.is_containing_commit.disable_cache() |
| git_util.is_ancestor_commit.disable_cache() |
| |
| return revlist, details |
| |
| def save_action_groups_between_releases(self, old, new, action_groups): |
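    """Saves the action groups between two releases to a json cache file."""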
| data = [ag.serialize() for ag in action_groups] |
| |
| cache_dir = os.path.join(self.root_dir, _DIFF_CACHE_DIR) |
| if not os.path.exists(cache_dir): |
| os.makedirs(cache_dir) |
| cache_filename = os.path.join(cache_dir, '%s,%s.json' % (old, new)) |
| with open(cache_filename, 'w') as fp: |
| json.dump(data, fp, indent=4, sort_keys=True) |
| |
| def load_action_groups_between_releases(self, old, new): |
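    """Loads cached action groups between two releases.

    Raises:
      errors.InternalError: if the cached revlist file does not exist
    """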
| cache_dir = os.path.join(self.root_dir, _DIFF_CACHE_DIR) |
| cache_filename = os.path.join(cache_dir, '%s,%s.json' % (old, new)) |
| if not os.path.exists(cache_filename): |
| raise errors.InternalError('cached revlist not found: %s' % |
| cache_filename) |
| |
| result = [] |
| with open(cache_filename) as f: |
| for data in json.load(f): |
| result.append(ActionGroup.unserialize(data)) |
| |
| return result |
| |
| def switch(self, rev): |
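    """Switches the source tree state to the given rev."""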
| rev_old, action_groups = self.get_intra_and_diff(rev) |
| self.spec_manager.sync_disk_state(rev_old) |
| apply_actions(self.code_storage, action_groups, self.root_dir) |
| |
| def get_intra_and_diff(self, rev): |
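    """Splits a rev into its base fixed rev and the ActionGroups to apply.

    Args:
      rev: intra-rev string or normal version number

    Returns:
      (rev_old, action_groups): the base version to sync to, and the list of
      ActionGroup objects to apply on top of it
    """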
| # easy case |
| if not re.match(_re_intra_rev, rev): |
| return rev, [] |
| |
| rev_old, rev_new, idx = parse_intra_rev(rev) |
| action_groups = self.load_action_groups_between_releases(rev_old, rev_new) |
| assert 0 <= idx <= len(action_groups) |
| action_groups = action_groups[:idx] |
| return rev_old, action_groups |