Add git-number script to calculate generation numbers for commits.

Compatible with any git topology (multiple roots, weird branching/merging, etc.)
I can't get it to be any faster (in python). Suggestions welcome :).

On z600/linux, this takes 5.1s to calculate the initial count for 2e3de954ef0a
(HEAD on src.git at the time of writing). Subsequent lookups take ~0.06s. For
reference, this machine takes 3s to just list the revisions in sorted order
without any additional processing (using rev-list).

All calculations are stored in a git-notes-style ref with the exception that the
leaf 'tree' object which would normally be stored in a git-notes world is
replaced with a packed binary file which consists of records [hash int]. Each
run of this script will create only 1 commit object on this internal ref which
will have as its parents:
  * The previous git number commit
  * All of the target commits we calculated numbers for.
This ref is then excluded on subsequent invocations of rev-list, which means that
git-number will only ever process commit objects which it hasn't already
calculated a value for. It also prevents you from attempting to number this
special ref :).

This implementation only has a 1-byte fanout which seems to be the best
performance for the repos we're dealing with (i.e. on the order of 500k commit
objects).  Bumping this up to a 2-byte fanout became extremely slow (I suspect
the internal caching structures I'm using are not efficient in this mode and
could be improved). Using no fanout is slower than the 1 byte fanout for lookups
by about 30%.

R=agable@chromium.org, stip@chromium.org, szager@chromium.org
BUG=280154,309692,skia:1639

Review URL: https://codereview.chromium.org/26109002

git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@236035 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/git_number.py b/git_number.py
new file mode 100755
index 0000000..04f676c
--- /dev/null
+++ b/git_number.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Usage: %prog [options] [<commitref>]*
+
+If no <commitref>'s are supplied, it defaults to HEAD.
+
+Calculates the generation number for one or more commits in a git repo.
+
+Generation number of a commit C with parents P is defined as:
+  generation_number(C, []) = 0
+  generation_number(C, P)  = max(map(generation_number, P)) + 1
+
+This number can be used to order commits relative to each other, as long as for
+any pair of the commits, one is an ancestor of the other.
+
+Since calculating the generation number of a commit requires walking that
+commit's entire history, this script caches all calculated data inside the git
+repo that it operates on in the ref 'refs/number/commits'.
+"""
+
+import binascii
+import collections
+import logging
+import optparse
+import os
+import struct
+import sys
+import tempfile
+
+import git_common as git
+import subprocess2
+
+CHUNK_FMT = '!20sL'
+CHUNK_SIZE = struct.calcsize(CHUNK_FMT)
+DIRTY_TREES = collections.defaultdict(int)
+REF = 'refs/number/commits'
+
+# Number of bytes to use for the prefix on our internal number structure.
+# 0 is slow to deserialize. 2 creates way too much bookeeping overhead (would
+# need to reimplement cache data structures to be a bit more sophisticated than
+# dicts. 1 seems to be just right.
+PREFIX_LEN = 1
+
+# Set this to 'threads' to gather coverage data while testing.
+POOL_KIND = 'procs'
+
+
+def pathlify(hash_prefix):
+  """Converts a binary object hash prefix into a posix path, one folder per
+  byte.
+
+  >>> pathlify('\xDE\xAD')
+  'de/ad'
+  """
+  return '/'.join('%02x' % ord(b) for b in hash_prefix)
+
+
+@git.memoize_one(threadsafe=False)
+def get_number_tree(prefix_bytes):
+  """Returns a dictionary of the git-number registry specified by
+  |prefix_bytes|.
+
+  This is in the form of {<full binary ref>: <gen num> ...}
+
+  >>> get_number_tree('\x83\xb4')
+  {'\x83\xb4\xe3\xe4W\xf9J*\x8f/c\x16\xecD\xd1\x04\x8b\xa9qz': 169, ...}
+  """
+  ref = '%s:%s' % (REF, pathlify(prefix_bytes))
+
+  try:
+    raw = buffer(git.run('cat-file', 'blob', ref, autostrip=False))
+    return dict(struct.unpack_from(CHUNK_FMT, raw, i * CHUNK_SIZE)
+                for i in xrange(len(raw) / CHUNK_SIZE))
+  except subprocess2.CalledProcessError:
+    return {}
+
+
+@git.memoize_one(threadsafe=False)
+def get_num(commit_hash):
+  """Returns the generation number for a commit.
+
+  Returns None if the generation number for this commit hasn't been calculated
+  yet (see load_generation_numbers()).
+  """
+  return get_number_tree(commit_hash[:PREFIX_LEN]).get(commit_hash)
+
+
+def clear_caches(on_disk=False):
+  """Clears in-process caches for e.g. unit testing."""
+  get_number_tree.clear()
+  get_num.clear()
+  if on_disk:
+    git.run('update-ref', '-d', REF)
+
+
+def intern_number_tree(tree):
+  """Transforms a number tree (in the form returned by |get_number_tree|) into
+  a git blob.
+
+  Returns the git blob id as hex-encoded string.
+
+  >>> d = {'\x83\xb4\xe3\xe4W\xf9J*\x8f/c\x16\xecD\xd1\x04\x8b\xa9qz': 169}
+  >>> intern_number_tree(d)
+  'c552317aa95ca8c3f6aae3357a4be299fbcb25ce'
+  """
+  with tempfile.TemporaryFile() as f:
+    for k, v in sorted(tree.iteritems()):
+      f.write(struct.pack(CHUNK_FMT, k, v))
+    f.seek(0)
+    return git.intern_f(f)
+
+
+def leaf_map_fn((pre, tree)):
+  """Converts a prefix and number tree into a git index line."""
+  return '100644 blob %s\t%s\0' % (intern_number_tree(tree), pathlify(pre))
+
+
+def finalize(targets):
+  """Saves all cache data to the git repository.
+
+  After calculating the generation number for |targets|, call finalize() to
+  save all the work to the git repository.
+
+  This in particular saves the trees referred to by DIRTY_TREES.
+  """
+  if not DIRTY_TREES:
+    return
+
+  msg = 'git-number Added %s numbers' % sum(DIRTY_TREES.itervalues())
+
+  idx = os.path.join(git.run('rev-parse', '--git-dir'), 'number.idx')
+  env = os.environ.copy()
+  env['GIT_INDEX_FILE'] = idx
+
+  progress_message = 'Finalizing: (%%(count)d/%d)' % len(DIRTY_TREES)
+  with git.ProgressPrinter(progress_message) as inc:
+    git.run('read-tree', REF, env=env)
+
+    prefixes_trees = ((p, get_number_tree(p)) for p in sorted(DIRTY_TREES))
+    updater = subprocess2.Popen(['git', 'update-index', '-z', '--index-info'],
+                                stdin=subprocess2.PIPE, env=env)
+
+    with git.ScopedPool(kind=POOL_KIND) as leaf_pool:
+      for item in leaf_pool.imap(leaf_map_fn, prefixes_trees):
+        updater.stdin.write(item)
+        inc()
+
+    updater.stdin.close()
+    updater.wait()
+    assert updater.returncode == 0
+
+    tree_id = git.run('write-tree', env=env)
+    commit_cmd = ['commit-tree', '-m', msg, '-p'] + git.hashes(REF)
+    for t in targets:
+      commit_cmd.extend(['-p', binascii.hexlify(t)])
+    commit_cmd.append(tree_id)
+    commit_hash = git.run(*commit_cmd)
+    git.run('update-ref', REF, commit_hash)
+  DIRTY_TREES.clear()
+
+
+def preload_tree(prefix):
+  """Returns the prefix and parsed tree object for the specified prefix."""
+  return prefix, get_number_tree(prefix)
+
+
+def all_prefixes(depth=PREFIX_LEN):
+  for x in (chr(i) for i in xrange(255)):
+    # This isn't covered because PREFIX_LEN currently == 1
+    if depth > 1:  # pragma: no cover
+      for r in all_prefixes(depth - 1):
+        yield x + r
+    else:
+      yield x
+
+
+def load_generation_numbers(targets):
+  """Populates the caches of get_num and get_number_tree so they contain
+  the results for |targets|.
+
+  Loads cached numbers from disk, and calculates missing numbers if one or
+  more of |targets| is newer than the cached calculations.
+
+  Args:
+    targets - An iterable of binary-encoded full git commit hashes.
+  """
+  # In case they pass us a generator, listify targets.
+  targets = list(targets)
+
+  if all(get_num(t) is not None for t in targets):
+    return
+
+  if git.tree(REF) is None:
+    empty = git.mktree({})
+    commit_hash = git.run('commit-tree', '-m', 'Initial commit from git-number',
+                          empty)
+    git.run('update-ref', REF, commit_hash)
+
+  with git.ScopedPool(kind=POOL_KIND) as pool:
+    preload_iter = pool.imap_unordered(preload_tree, all_prefixes())
+
+    rev_list = []
+
+    with git.ProgressPrinter('Loading commits: %(count)d') as inc:
+      # Curiously, buffering the list into memory seems to be the fastest
+      # approach in python (as opposed to iterating over the lines in the
+      # stdout as they're produced). GIL strikes again :/
+      cmd = [
+        'rev-list', '--topo-order', '--parents', '--reverse', '^' + REF,
+      ] + map(binascii.hexlify, targets)
+      for line in git.run(*cmd).splitlines():
+        tokens = map(binascii.unhexlify, line.split())
+        rev_list.append((tokens[0], tokens[1:]))
+        inc()
+
+    get_number_tree.update(preload_iter)
+
+  with git.ProgressPrinter('Counting: %%(count)d/%d' % len(rev_list)) as inc:
+    for commit_hash, pars in rev_list:
+      num = max(map(get_num, pars)) + 1 if pars else 0
+
+      prefix = commit_hash[:PREFIX_LEN]
+      get_number_tree(prefix)[commit_hash] = num
+      DIRTY_TREES[prefix] += 1
+      get_num.set(commit_hash, num)
+
+      inc()
+
+
+def main():  # pragma: no cover
+  parser = optparse.OptionParser(usage=sys.modules[__name__].__doc__)
+  parser.add_option('--no-cache', action='store_true',
+                    help='Do not actually cache anything we calculate.')
+  parser.add_option('--reset', action='store_true',
+                    help='Reset the generation number cache and quit.')
+  parser.add_option('-v', '--verbose', action='count', default=0,
+                    help='Be verbose. Use more times for more verbosity.')
+  opts, args = parser.parse_args()
+
+  levels = [logging.ERROR, logging.INFO, logging.DEBUG]
+  logging.basicConfig(level=levels[min(opts.verbose, len(levels) - 1)])
+
+  try:
+    if opts.reset:
+      clear_caches(on_disk=True)
+      return
+
+    try:
+      targets = git.parse_commitrefs(*(args or ['HEAD']))
+    except git.BadCommitRefException as e:
+      parser.error(e)
+
+    load_generation_numbers(targets)
+    if not opts.no_cache:
+      finalize(targets)
+
+    print '\n'.join(map(str, map(get_num, targets)))
+    return 0
+  except KeyboardInterrupt:
+    return 1
+
+
+if __name__ == '__main__':  # pragma: no cover
+  sys.exit(main())