dep_tracker: New script to track dependencies between files.

The new dep_tracker script can be used to track dependencies between
files. This patch creates the dep_tracker script with support for
dynamic library dependencies in ELF files.

BUG=chromium:404692
TEST=Unittest added.
TEST=`dep_tracker --json /tmp/stats.json --portage-db /build/link/var/db/pkg/ ../build/images/link/latest/dir_3`

Change-Id: Ibd3753dda4ffca0e12a7aeecadbcfbc73678c486
Reviewed-on: https://chromium-review.googlesource.com/212947
Tested-by: Alex Deymo <deymo@chromium.org>
Reviewed-by: Gaurav Shah <gauravsh@chromium.org>
Reviewed-by: David James <davidjames@chromium.org>
Commit-Queue: Alex Deymo <deymo@chromium.org>
diff --git a/scripts/dep_tracker.py b/scripts/dep_tracker.py
new file mode 100644
index 0000000..ca8478c
--- /dev/null
+++ b/scripts/dep_tracker.py
@@ -0,0 +1,287 @@
+# Copyright 2014 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""\
+Script to discover dependencies and other file information from a build.
+
+Some files in the image, such as chrome, shill or bluetoothd, are installed
+to provide functionality that may or may not be present on a given build.
+Many other files are dependencies from these
+files that need to be present in the image for them to work. These dependencies
+come from needed shared libraries, executed files and other configuration files
+read.
+
+This script currently discovers dependencies between ELF files for libraries
+required at load time (libraries loaded by the dynamic linker) but not
+libraries loaded at runtime with dlopen().
+"""
+
+import itertools
+import json
+import multiprocessing
+import os
+import re
+import stat
+
+from chromite.lib import commandline
+from chromite.lib import cros_build_lib
+from chromite.lib import parseelf
+from chromite.scripts import lddtree
+
+
# Regexes used to split a Gentoo ebuild name ("atom") into the package name
# and, when present, the version suffix. They should match names like the
# following examples.
# Without a version suffix:
#   chromeos-base/tty
#   chromeos-base/libchrome-271506
#   sys-kernel/chromeos-kernel-3_8
# With a version suffix:
#   chromeos-base/tty-0.0.1-r4
#   chromeos-base/libchrome-271506-r5
#   sys-kernel/chromeos-kernel-3_8-3.8.11-r35
RE_EBUILD_WITHOUT_VERSION = r'^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$'
RE_EBUILD_WITH_VERSION = (
    r'^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)\-([^\-]+(\-r\d+)?)$')
+
+
def ParseELFWithArgs(args):
  """Wrapper to parseelf.ParseELF accepting a single arg.

  This wrapper is required to use the multiprocessing.Pool.map function,
  which passes exactly one argument per work item.

  Args:
    args: Tuple of positional arguments for parseelf.ParseELF(); args[1] is
      the relative path of the file being parsed.

  Returns:
    A 2-tuple with the passed relative path and the result of ParseELF(). On
    error, when ParseELF() returns None, this function returns None.
  """
  result = parseelf.ParseELF(*args)
  return None if result is None else (args[1], result)
+
+
class DepTracker(object):
  """Tracks dependencies and file information in a root directory.

  This class computes dependencies and other information related to the files
  in the root image.
  """

  def __init__(self, root, jobs=1):
    """Initializes the tracker for the rootfs mounted at |root|.

    Args:
      root: Path to the directory holding the rootfs to inspect.
      jobs: Number of parallel jobs for the expensive ELF parsing step.

    Raises:
      ValueError: If |root| is not a directory. (ValueError subclasses
        Exception, so callers catching the previous generic Exception still
        work.)
    """
    root_st = os.lstat(root)
    if not stat.S_ISDIR(root_st.st_mode):
      raise ValueError('root (%s) must be a directory' % root)
    self._root = root.rstrip('/') + '/'

    # A wrapper to the multiprocess map function. We avoid launching a pool
    # of processes when jobs is 1 so python exceptions kill the main process,
    # useful for debugging.
    if jobs > 1:
      self._pool = multiprocessing.Pool(jobs)
      self._imap = self._pool.map
    else:
      # itertools.imap only exists on Python 2; the builtin map is the
      # equivalent lazy fallback on Python 3.
      self._imap = getattr(itertools, 'imap', map)

    # Maps rel_path -> dict of per-file information ('size', 'deps', ...).
    self._files = {}
    # Maps package name -> dict of per-ebuild information.
    self._ebuilds = {}

    # Mapping of rel_paths for symlinks and hardlinks. Hardlinks are assumed
    # to point to the lowest lexicographically file with the same inode.
    self._symlinks = {}
    self._hardlinks = {}

  def Init(self):
    """Generates the initial list of files."""
    # First iteration over all the files in root searching for symlinks and
    # non-regular files.
    seen_inodes = {}
    for basepath, _, filenames in sorted(os.walk(self._root)):
      for filename in sorted(filenames):
        full_path = os.path.join(basepath, filename)
        rel_path = full_path[len(self._root):]
        st = os.lstat(full_path)

        file_data = {
            'size': st.st_size,
        }
        self._files[rel_path] = file_data

        # Track symlinks.
        if stat.S_ISLNK(st.st_mode):
          link_path = os.readlink(full_path)
          # lddtree's normpath handles a little more cases than the os.path
          # version. In particular, it handles the '//' case.
          self._symlinks[rel_path] = (
              link_path.lstrip('/') if link_path and link_path[0] == '/' else
              lddtree.normpath(os.path.join(os.path.dirname(rel_path),
                                            link_path)))
          file_data['deps'] = {
              'symlink': [self._symlinks[rel_path]],
          }

        # Track hardlinks. The first (lexicographically lowest) path seen for
        # an inode is the canonical one; later paths map to it.
        if st.st_ino in seen_inodes:
          self._hardlinks[rel_path] = seen_inodes[st.st_ino]
          continue
        seen_inodes[st.st_ino] = rel_path

  def SaveJSON(self, filename):
    """Save the computed information to a JSON file.

    Args:
      filename: The destination JSON file.
    """
    data = {
        'files': self._files,
        'ebuilds': self._ebuilds,
    }
    # Use a context manager so the file is flushed and closed even on error
    # (the previous version leaked the file object).
    with open(filename, 'w') as output:
      json.dump(data, output)

  @staticmethod
  def _ParsePackageAtom(pkg):
    """Splits a Gentoo package name into its atom and version.

    The versioned form takes precedence when both regexes match, preserving
    the original lookup order.

    Args:
      pkg: A package name such as "chromeos-base/tty-0.0.1-r4".

    Returns:
      An (atom, version) tuple. |version| is None when |pkg| has no version
      suffix; |atom| falls back to |pkg| itself when neither regex matches.
    """
    m = re.match(RE_EBUILD_WITH_VERSION, pkg)
    if m:
      return m.group(1), m.group(2)
    m = re.match(RE_EBUILD_WITHOUT_VERSION, pkg)
    if m:
      return m.group(1), None
    return pkg, None

  def ComputeEbuildDeps(self, portage_db):
    """Compute the dependencies between ebuilds and files.

    Iterates over the list of ebuilds in the database and annotates the files
    with the ebuilds they are in. For each ebuild installing a file in the
    root, also compute the direct dependencies. Stores the information
    internally.

    Args:
      portage_db: The path to the portage db. Usually "/var/db/pkg".
    """
    portage_db = portage_db.rstrip('/') + '/'
    for basepath, _, filenames in sorted(os.walk(portage_db)):
      if 'CONTENTS' not in filenames:
        continue
      full_path = os.path.join(basepath, 'CONTENTS')
      pkg = basepath[len(portage_db):]
      pkg_files = []
      pkg_size = 0
      # Context manager so every CONTENTS handle is closed promptly.
      with open(full_path) as contents:
        for line in contents:
          line = line.split()
          # Line format is: "type file_path [more space-separated fields]".
          # Discard any other line without at least the first two fields. The
          # remaining fields depend on the type.
          if len(line) < 2:
            continue
          typ, file_path = line[:2]
          # We ignore other entries like for example "dir".
          if typ not in ('obj', 'sym'):
            continue
          file_path = file_path.lstrip('/')
          # We ignore files installed in the SYSROOT that weren't copied to
          # the image.
          if file_path not in self._files:
            continue
          pkg_files.append(file_path)
          file_data = self._files[file_path]
          if 'ebuild' in file_data:
            # Fixed format string: the last placeholder was a bare '%',
            # which would raise when the warning was emitted.
            cros_build_lib.Warning('Duplicated entry for %s: %s and %s',
                                   file_path, file_data['ebuild'], pkg)
          file_data['ebuild'] = pkg
          pkg_size += file_data['size']
      if pkg_files:
        pkg_atom, pkg_version = self._ParsePackageAtom(pkg)
        self._ebuilds[pkg] = {
            'size': pkg_size,
            'files': len(pkg_files),
            'atom': pkg_atom,
            'version': pkg_version,
        }
    # TODO(deymo): Parse dependencies between ebuilds.

  def ComputeELFFileDeps(self):
    """Computes the dependencies between files.

    Computes the dependencies between the files in the root directory passed
    during construction. The dependencies are inferred for ELF files.
    The list of dependencies for each file in the passed rootfs as a dict().
    The result's keys are the relative path of the files and the value of each
    file is a list of dependencies. A dependency is a tuple (dep_path,
    dep_type) where the dep_path is relative path from the passed root to the
    dependent file and dep_type is one the following strings stating how the
    dependency was discovered:
      'ldd': The dependent ELF file is listed as needed in the dynamic section.
      'symlink': The dependent file is a symlink to the depending.
    If there are dependencies of a given type whose target file wasn't
    determined, a tuple (None, dep_type) is included. This is the case for
    example if a program uses a library that wasn't found.
    """
    ldpaths = lddtree.LoadLdpaths(self._root)

    # Build the list of ELF parsing jobs, skipping symlinks, hardlinks and
    # non-regular files.
    parseelf_args = []
    for rel_path in self._files:
      if rel_path in self._symlinks or rel_path in self._hardlinks:
        continue

      full_path = os.path.join(self._root, rel_path)
      st = os.lstat(full_path)
      if not stat.S_ISREG(st.st_mode):
        continue
      parseelf_args.append((self._root, rel_path, ldpaths))

    # Parallelize the ELF lookup step since it is quite expensive.
    elfs = dict(x for x in self._imap(ParseELFWithArgs, parseelf_args)
                if x is not None)

    for rel_path, elf in elfs.items():
      file_data = self._files[rel_path]
      file_deps = file_data.get('deps', {})
      # Dependencies based on the result of ldd.
      for lib in elf.get('needed', []):
        lib_path = elf['libs'][lib]['path']
        file_deps.setdefault('ldd', []).append(lib_path)

      if file_deps:
        file_data['deps'] = file_deps
+
+
def ParseArgs(argv):
  """Parse and validate the command line flags.

  Args:
    argv: The raw command line arguments, excluding the program name.

  Returns:
    A frozen options namespace.
  """
  parser = commandline.ArgumentParser()
  parser.add_argument('-j', '--jobs', type=int,
                      default=multiprocessing.cpu_count(),
                      help='number of simultaneous jobs.')
  parser.add_argument('--portage-db', type='path', metavar='PORTAGE_DB',
                      help='parse portage DB for ebuild information')
  parser.add_argument('--json', type='path',
                      help='store information in JSON file')
  parser.add_argument('root', type='path',
                      help='path to the directory where the rootfs is mounted.')

  options = parser.parse_args(argv)
  options.Freeze()
  return options
+
+
def main(argv):
  """Entry point: track file dependencies for the rootfs named in |argv|."""
  opts = ParseArgs(argv)
  cros_build_lib.Debug('Options are %s', opts)

  tracker = DepTracker(opts.root, jobs=opts.jobs)
  tracker.Init()
  tracker.ComputeELFFileDeps()

  # Ebuild annotation and JSON output are both optional.
  if opts.portage_db:
    tracker.ComputeEbuildDeps(opts.portage_db)

  if opts.json:
    tracker.SaveJSON(opts.json)