dep_tracker: New script to track dependencies between files.
The new dep_tracker script can be used to track dependencies between
files. This patch creates the dep_tracker script with support for
dynamic library dependencies in ELF files.
BUG=chromium:404692
TEST=Unittest added.
TEST=`dep_tracker --json /tmp/stats.json --portage-db /build/link/var/db/pkg/ ../build/images/link/latest/dir_3`
Change-Id: Ibd3753dda4ffca0e12a7aeecadbcfbc73678c486
Reviewed-on: https://chromium-review.googlesource.com/212947
Tested-by: Alex Deymo <deymo@chromium.org>
Reviewed-by: Gaurav Shah <gauravsh@chromium.org>
Reviewed-by: David James <davidjames@chromium.org>
Commit-Queue: Alex Deymo <deymo@chromium.org>
diff --git a/scripts/dep_tracker.py b/scripts/dep_tracker.py
new file mode 100644
index 0000000..ca8478c
--- /dev/null
+++ b/scripts/dep_tracker.py
@@ -0,0 +1,287 @@
+# Copyright 2014 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""\
+Script to discover dependencies and other file information from a build.
+
+Some files in the image are installed to provide some functionality, such as
+chrome, shill or bluetoothd provide different functionality that can be
+present or not on a given build. Many other files are dependencies from these
+files that need to be present in the image for them to work. These dependencies
+come from needed shared libraries, executed files and other configuration files
+read.
+
+This script currently discovers dependencies between ELF files for libraries
+required at load time (libraries loaded by the dynamic linker) but not
+libraries loaded at runtime with dlopen().
+"""
+
+import itertools
+import json
+import multiprocessing
+import os
+import re
+import stat
+
+from chromite.lib import commandline
+from chromite.lib import cros_build_lib
+from chromite.lib import parseelf
+from chromite.scripts import lddtree
+
+
+# Regex to parse Gentoo atoms. This should match the following ebuild names,
+# splitting the package name from the version.
+# without version:
+# chromeos-base/tty
+# chromeos-base/libchrome-271506
+# sys-kernel/chromeos-kernel-3_8
+# with version:
+# chromeos-base/tty-0.0.1-r4
+# chromeos-base/libchrome-271506-r5
+# sys-kernel/chromeos-kernel-3_8-3.8.11-r35
+RE_EBUILD_WITHOUT_VERSION = r'^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$'
+RE_EBUILD_WITH_VERSION = (
+ r'^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)\-([^\-]+(\-r\d+)?)$')
+
+
+def ParseELFWithArgs(args):
+ """Wrapper to parseelf.ParseELF accepting a single arg.
+
+ This wrapper is required to use multiprocessing.Pool.map function.
+
+ Returns:
+ A 2-tuple with the passed relative path and the result of ParseELF(). On
+ error, when ParseELF() returns None, this function returns None.
+ """
+ elf = parseelf.ParseELF(*args)
+ if elf is None:
+ return
+ return args[1], elf
+
+
+class DepTracker(object):
+ """Tracks dependencies and file information in a root directory.
+
+ This class computes dependencies and other information related to the files
+ in the root image.
+ """
+
+ def __init__(self, root, jobs=1):
+ root_st = os.lstat(root)
+ if not stat.S_ISDIR(root_st.st_mode):
+ raise Exception('root (%s) must be a directory' % root)
+ self._root = root.rstrip('/') + '/'
+
+ # A wrapper to the multiprocess map function. We avoid launching a pool
+ # of processes when jobs is 1 so python exceptions kill the main process,
+ # useful for debugging.
+ if jobs > 1:
+ self._pool = multiprocessing.Pool(jobs)
+ self._imap = self._pool.map
+ else:
+ self._imap = itertools.imap
+
+ self._files = {}
+ self._ebuilds = {}
+
+ # Mapping of rel_paths for symlinks and hardlinks. Hardlinks are assumed
+ # to point to the lowest lexicographically file with the same inode.
+ self._symlinks = {}
+ self._hardlinks = {}
+
+
+ def Init(self):
+ """Generates the initial list of files."""
+ # First iteration over all the files in root searching for symlinks and
+ # non-regular files.
+ seen_inodes = {}
+ for basepath, _, filenames in sorted(os.walk(self._root)):
+ for filename in sorted(filenames):
+ full_path = os.path.join(basepath, filename)
+ rel_path = full_path[len(self._root):]
+ st = os.lstat(full_path)
+
+ file_data = {
+ 'size': st.st_size,
+ }
+ self._files[rel_path] = file_data
+
+ # Track symlinks.
+ if stat.S_ISLNK(st.st_mode):
+ link_path = os.readlink(full_path)
+ # lddtree's normpath handles a little more cases than the os.path
+ # version. In particular, it handles the '//' case.
+ self._symlinks[rel_path] = (
+ link_path.lstrip('/') if link_path and link_path[0] == '/' else
+ lddtree.normpath(os.path.join(os.path.dirname(rel_path),
+ link_path)))
+ file_data['deps'] = {
+ 'symlink': [self._symlinks[rel_path]]
+ }
+
+ # Track hardlinks.
+ if st.st_ino in seen_inodes:
+ self._hardlinks[rel_path] = seen_inodes[st.st_ino]
+ continue
+ seen_inodes[st.st_ino] = rel_path
+
+ def SaveJSON(self, filename):
+ """Save the computed information to a JSON file.
+
+ Args:
+ filename: The destination JSON file.
+ """
+ data = {
+ 'files': self._files,
+ 'ebuilds': self._ebuilds,
+ }
+ json.dump(data, open(filename, 'w'))
+
+
+ def ComputeEbuildDeps(self, portage_db):
+ """Compute the dependencies between ebuilds and files.
+
+ Iterates over the list of ebuilds in the database and annotates the files
+ with the ebuilds they are in. For each ebuild installing a file in the root,
+ also compute the direct dependencies. Stores the information internally.
+
+ Args:
+ portage_db: The path to the portage db. Usually "/var/db/pkg".
+ """
+ portage_db = portage_db.rstrip('/') + '/'
+ for basepath, _, filenames in sorted(os.walk(portage_db)):
+ if 'CONTENTS' in filenames:
+ full_path = os.path.join(basepath, 'CONTENTS')
+ pkg = basepath[len(portage_db):]
+ pkg_files = []
+ pkg_size = 0
+ for line in open(full_path):
+ line = line.split()
+ # Line format is: "type file_path [more space-separated fields]".
+ # Discard any other line without at least the first two fields. The
+ # remaining fields depend on the type.
+ if len(line) < 2:
+ continue
+ typ, file_path = line[:2]
+ # We ignore other entries like for example "dir".
+ if not typ in ('obj', 'sym'):
+ continue
+ file_path = file_path.lstrip('/')
+ # We ignore files installed in the SYSROOT that weren't copied to the
+ # image.
+ if not file_path in self._files:
+ continue
+ pkg_files.append(file_path)
+ file_data = self._files[file_path]
+ if 'ebuild' in file_data:
+ cros_build_lib.Warning('Duplicated entry for %s: %s and %',
+ file_path, file_data['ebuild'], pkg)
+ file_data['ebuild'] = pkg
+ pkg_size += file_data['size']
+ if pkg_files:
+ pkg_atom = pkg
+ pkg_version = None
+ m = re.match(RE_EBUILD_WITHOUT_VERSION, pkg)
+ if m:
+ pkg_atom = m.group(1)
+ m = re.match(RE_EBUILD_WITH_VERSION, pkg)
+ if m:
+ pkg_atom = m.group(1)
+ pkg_version = m.group(2)
+ self._ebuilds[pkg] = {
+ 'size': pkg_size,
+ 'files': len(pkg_files),
+ 'atom': pkg_atom,
+ 'version': pkg_version,
+ }
+ # TODO(deymo): Parse dependencies between ebuilds.
+
+ def ComputeELFFileDeps(self):
+ """Computes the dependencies between files.
+
+ Computes the dependencies between the files in the root directory passed
+ during construction. The dependencies are inferred for ELF files.
+ The list of dependencies for each file in the passed rootfs as a dict().
+ The result's keys are the relative path of the files and the value of each
+ file is a list of dependencies. A dependency is a tuple (dep_path,
+ dep_type) where the dep_path is relative path from the passed root to the
+ dependent file and dep_type is one the following strings stating how the
+ dependency was discovered:
+ 'ldd': The dependent ELF file is listed as needed in the dynamic section.
+ 'symlink': The dependent file is a symlink to the depending.
+ If there are dependencies of a given type whose target file wasn't
+ determined, a tuple (None, dep_type) is included. This is the case for
+ example is a program uses library that wasn't found.
+ """
+ ldpaths = lddtree.LoadLdpaths(self._root)
+
+ # First iteration over all the files in root searching for symlinks and
+ # non-regular files.
+ parseelf_args = []
+ for rel_path, file_data in self._files.iteritems():
+ if rel_path in self._symlinks or rel_path in self._hardlinks:
+ continue
+
+ full_path = os.path.join(self._root, rel_path)
+ st = os.lstat(full_path)
+ if not stat.S_ISREG(st.st_mode):
+ continue
+ parseelf_args.append((self._root, rel_path, ldpaths))
+
+ # Parallelize the ELF lookup step since it is quite expensive.
+ elfs = dict(x for x in self._imap(ParseELFWithArgs, parseelf_args)
+ if not x is None)
+
+ for rel_path, elf in elfs.iteritems():
+ file_data = self._files[rel_path]
+ file_deps = file_data.get('deps', {})
+ # Dependencies based on the result of ldd.
+ for lib in elf.get('needed', []):
+ lib_path = elf['libs'][lib]['path']
+ if not 'ldd' in file_deps:
+ file_deps['ldd'] = []
+ file_deps['ldd'].append(lib_path)
+
+ if file_deps:
+ file_data['deps'] = file_deps
+
+
+def ParseArgs(argv):
+ """Return parsed commandline arguments."""
+
+ parser = commandline.ArgumentParser()
+ parser.add_argument(
+ '-j', '--jobs', type=int, default=multiprocessing.cpu_count(),
+ help='number of simultaneous jobs.')
+ parser.add_argument(
+ '--portage-db', type='path', metavar='PORTAGE_DB',
+ help='parse portage DB for ebuild information')
+ parser.add_argument(
+ '--json', type='path',
+ help='store information in JSON file')
+
+ parser.add_argument(
+ 'root', type='path',
+ help='path to the directory where the rootfs is mounted.')
+
+ opts = parser.parse_args(argv)
+ opts.Freeze()
+ return opts
+
+
+def main(argv):
+ """Main function to start the script."""
+ opts = ParseArgs(argv)
+ cros_build_lib.Debug('Options are %s', opts)
+
+ dt = DepTracker(opts.root, jobs=opts.jobs)
+ dt.Init()
+
+ dt.ComputeELFFileDeps()
+
+ if opts.portage_db:
+ dt.ComputeEbuildDeps(opts.portage_db)
+
+ if opts.json:
+ dt.SaveJSON(opts.json)