Add git-number script to calculate generation numbers for commits.

Compatible with any git topology (multiple roots, weird branching/merging, etc.)
I can't get it to be any faster (in python). Suggestions welcome :).

On z600/linux, this takes 5.1s to calculate the initial count for 2e3de954ef0a
(HEAD on src.git at the time of writing). Subsequent lookups take ~0.06s. For
reference, this machine takes 3s to just list the revisions in sorted order
without any additional processing (using rev-list).

All calculations are stored in a git-notes-style ref with the exception that the
leaf 'tree' object which would normally be stored in a git-notes world is
replaced with a packed binary file which consists of records [hash int]. Each
run of this script will create only 1 commit object on this internal ref which
will have as its parents:
  * The previous git number commit
  * All of the target commits we calculated numbers for.
This ref is then excluded on subsequent invocations of rev-list, which means that
git-number will only ever process commit objects which it hasn't already
calculated a value for. It also prevents you from attempting to number this
special ref :).

This implementation only has a 1-byte fanout which seems to be the best
performance for the repos we're dealing with (i.e. on the order of 500k commit
objects).  Bumping this up to a 2-byte fanout became extremely slow (I suspect
the internal caching structures I'm using are not efficient in this mode and
could be improved). Using no fanout is slower than the 1 byte fanout for lookups
by about 30%.

R=agable@chromium.org, stip@chromium.org, szager@chromium.org
BUG=280154,309692,skia:1639

Review URL: https://codereview.chromium.org/26109002

git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@236035 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/git_common.py b/git_common.py
new file mode 100644
index 0000000..1215d9c
--- /dev/null
+++ b/git_common.py
@@ -0,0 +1,301 @@
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Monkeypatch IMapIterator so that Ctrl-C can kill everything properly.
+# Derived from https://gist.github.com/aljungberg/626518
+import multiprocessing.pool
+from multiprocessing.pool import IMapIterator
+def wrapper(func):
+  def wrap(self, timeout=None):
+    return func(self, timeout=timeout or 1e100)
+  return wrap
+IMapIterator.next = wrapper(IMapIterator.next)
+IMapIterator.__next__ = IMapIterator.next
+# TODO(iannucci): Monkeypatch all other 'wait' methods too.
+
+
+import binascii
+import contextlib
+import functools
+import logging
+import signal
+import sys
+import tempfile
+import threading
+
+import subprocess2
+
+
+GIT_EXE = 'git.bat' if sys.platform.startswith('win') else 'git'
+
+
+class BadCommitRefException(Exception):
+  def __init__(self, refs):
+    msg = ('one of %s does not seem to be a valid commitref.' %
+           str(refs))
+    super(BadCommitRefException, self).__init__(msg)
+
+
+def memoize_one(**kwargs):
+  """Memoizes a single-argument pure function.
+
+  Values of None are not cached.
+
+  Kwargs:
+    threadsafe (bool) - REQUIRED. Specifies whether to use locking around
+      cache manipulation functions. This is a kwarg so that users of memoize_one
+      are forced to explicitly and verbosely pick True or False.
+
+  Adds three methods to the decorated function:
+    * get(key, default=None) - Gets the value for this key from the cache.
+    * set(key, value) - Sets the value for this key from the cache.
+    * clear() - Drops the entire contents of the cache.  Useful for unittests.
+    * update(other) - Updates the contents of the cache from another dict.
+  """
+  assert 'threadsafe' in kwargs, 'Must specify threadsafe={True,False}'
+  threadsafe = kwargs['threadsafe']
+
+  if threadsafe:
+    def withlock(lock, f):
+      def inner(*args, **kwargs):
+        with lock:
+          return f(*args, **kwargs)
+      return inner
+  else:
+    def withlock(_lock, f):
+      return f
+
+  def decorator(f):
+    # Instantiate the lock in decorator, in case users of memoize_one do:
+    #
+    # memoizer = memoize_one(threadsafe=True)
+    #
+    # @memoizer
+    # def fn1(val): ...
+    #
+    # @memoizer
+    # def fn2(val): ...
+
+    lock = threading.Lock() if threadsafe else None
+    cache = {}
+    _get = withlock(lock, cache.get)
+    _set = withlock(lock, cache.__setitem__)
+
+    @functools.wraps(f)
+    def inner(arg):
+      ret = _get(arg)
+      if ret is None:
+        ret = f(arg)
+        if ret is not None:
+          _set(arg, ret)
+      return ret
+    inner.get = _get
+    inner.set = _set
+    inner.clear = withlock(lock, cache.clear)
+    inner.update = withlock(lock, cache.update)
+    return inner
+  return decorator
+
+
+def _ScopedPool_initer(orig, orig_args):  # pragma: no cover
+  """Initializer method for ScopedPool's subprocesses.
+
+  This helps ScopedPool handle Ctrl-C's correctly.
+  """
+  signal.signal(signal.SIGINT, signal.SIG_IGN)
+  if orig:
+    orig(*orig_args)
+
+
+@contextlib.contextmanager
+def ScopedPool(*args, **kwargs):
+  """Context Manager which returns a multiprocessing.pool instance which
+  correctly deals with thrown exceptions.
+
+  *args - Arguments to multiprocessing.pool
+
+  Kwargs:
+    kind ('threads', 'procs') - The type of underlying coprocess to use.
+    **etc - Arguments to multiprocessing.pool
+  """
+  if kwargs.pop('kind', None) == 'threads':
+    pool = multiprocessing.pool.ThreadPool(*args, **kwargs)
+  else:
+    orig, orig_args = kwargs.get('initializer'), kwargs.get('initargs', ())
+    kwargs['initializer'] = _ScopedPool_initer
+    kwargs['initargs'] = orig, orig_args
+    pool = multiprocessing.pool.Pool(*args, **kwargs)
+
+  try:
+    yield pool
+    pool.close()
+  except:
+    pool.terminate()
+    raise
+  finally:
+    pool.join()
+
+
+class ProgressPrinter(object):
+  """Threaded single-stat status message printer."""
+  def __init__(self, fmt, enabled=None, stream=sys.stderr, period=0.5):
+    """Create a ProgressPrinter.
+
+    Use it as a context manager which produces a simple 'increment' method:
+
+      with ProgressPrinter('(%%(count)d/%d)' % 1000) as inc:
+        for i in xrange(1000):
+          # do stuff
+          if i % 10 == 0:
+            inc(10)
+
+    Args:
+      fmt - String format with a single '%(count)d' where the counter value
+        should go.
+      enabled (bool) - If this is None, will default to True if
+        logging.getLogger() is set to INFO or more verbose.
+      stream (file-like) - The stream to print status messages to.
+      period (float) - The time in seconds for the printer thread to wait
+        between printing.
+    """
+    self.fmt = fmt
+    if enabled is None:  # pragma: no cover
+      self.enabled = logging.getLogger().isEnabledFor(logging.INFO)
+    else:
+      self.enabled = enabled
+
+    self._count = 0
+    self._dead = False
+    self._dead_cond = threading.Condition()
+    self._stream = stream
+    self._thread = threading.Thread(target=self._run)
+    self._period = period
+
+  def _emit(self, s):
+    if self.enabled:
+      self._stream.write('\r' + s)
+      self._stream.flush()
+
+  def _run(self):
+    with self._dead_cond:
+      while not self._dead:
+        self._emit(self.fmt % {'count': self._count})
+        self._dead_cond.wait(self._period)
+        self._emit((self.fmt + '\n') % {'count': self._count})
+
+  def inc(self, amount=1):
+    self._count += amount
+
+  def __enter__(self):
+    self._thread.start()
+    return self.inc
+
+  def __exit__(self, _exc_type, _exc_value, _traceback):
+    self._dead = True
+    with self._dead_cond:
+      self._dead_cond.notifyAll()
+    self._thread.join()
+    del self._thread
+
+
+def parse_commitrefs(*commitrefs):
+  """Returns binary encoded commit hashes for one or more commitrefs.
+
+  A commitref is anything which can resolve to a commit. Popular examples:
+    * 'HEAD'
+    * 'origin/master'
+    * 'cool_branch~2'
+  """
+  try:
+    return map(binascii.unhexlify, hashes(*commitrefs))
+  except subprocess2.CalledProcessError:
+    raise BadCommitRefException(commitrefs)
+
+
+def run(*cmd, **kwargs):
+  """Runs a git command. Returns stdout as a string.
+
+  If logging is DEBUG, we'll print the command before we run it.
+
+  kwargs
+    autostrip (bool) - Strip the output. Defaults to True.
+  Output string is always strip()'d.
+  """
+  autostrip = kwargs.pop('autostrip', True)
+  cmd = (GIT_EXE,) + cmd
+  logging.debug('Running %s', ' '.join(repr(tok) for tok in cmd))
+  ret = subprocess2.check_output(cmd, stderr=subprocess2.PIPE, **kwargs)
+  if autostrip:
+    ret = (ret or '').strip()
+  return ret
+
+
+def hashes(*reflike):
+  return run('rev-parse', *reflike).splitlines()
+
+
+def intern_f(f, kind='blob'):
+  """Interns a file object into the git object store.
+
+  Args:
+    f (file-like object) - The file-like object to intern
+    kind (git object type) - One of 'blob', 'commit', 'tree', 'tag'.
+
+  Returns the git hash of the interned object (hex encoded).
+  """
+  ret = run('hash-object', '-t', kind, '-w', '--stdin', stdin=f)
+  f.close()
+  return ret
+
+
+def tree(treeref, recurse=False):
+  """Returns a dict representation of a git tree object.
+
+  Args:
+    treeref (str) - a git ref which resolves to a tree (commits count as trees).
+    recurse (bool) - include all of the tree's decendants too. File names will
+      take the form of 'some/path/to/file'.
+
+  Return format:
+    { 'file_name': (mode, type, ref) }
+
+    mode is an integer where:
+      * 0040000 - Directory
+      * 0100644 - Regular non-executable file
+      * 0100664 - Regular non-executable group-writeable file
+      * 0100755 - Regular executable file
+      * 0120000 - Symbolic link
+      * 0160000 - Gitlink
+
+    type is a string where it's one of 'blob', 'commit', 'tree', 'tag'.
+
+    ref is the hex encoded hash of the entry.
+  """
+  ret = {}
+  opts = ['ls-tree', '--full-tree']
+  if recurse:
+    opts.append('-r')
+  opts.append(treeref)
+  try:
+    for line in run(*opts).splitlines():
+      mode, typ, ref, name = line.split(None, 3)
+      ret[name] = (mode, typ, ref)
+  except subprocess2.CalledProcessError:
+    return None
+  return ret
+
+
+def mktree(treedict):
+  """Makes a git tree object and returns its hash.
+
+  See |tree()| for the values of mode, type, and ref.
+
+  Args:
+    treedict - { name: (mode, type, ref) }
+  """
+  with tempfile.TemporaryFile() as f:
+    for name, (mode, typ, ref) in treedict.iteritems():
+      f.write('%s %s %s\t%s\0' % (mode, typ, ref, name))
+    f.seek(0)
+    return run('mktree', '-z', stdin=f)