gconv_strip: Move it to chromite.

This includes the changes up to CL:212132 and local modifications to
use the chromite style:

* gconv_strip.py is called using the wrapper.py
* subprocess is replaced with cros_build_lib.RunCommand
* the standard python argparse module is replaced by chromite's
  commandline.

BUG=chromium:403050
TEST=`gconv_strip --debug --dry-run /path/to/rootfs` still works.

Change-Id: I217d9079738d4825792210bc9a0f7ca309825f70
Reviewed-on: https://chromium-review.googlesource.com/212261
Reviewed-by: Gaurav Shah <gauravsh@chromium.org>
Commit-Queue: Alex Deymo <deymo@chromium.org>
Tested-by: Alex Deymo <deymo@chromium.org>
diff --git a/scripts/gconv_strip.py b/scripts/gconv_strip.py
new file mode 100644
index 0000000..cd1438d
--- /dev/null
+++ b/scripts/gconv_strip.py
@@ -0,0 +1,297 @@
+#!/usr/bin/python
+# Copyright 2014 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Script to remove unused gconv charset modules from a build."""
+
+import ahocorasick
+import glob
+import operator
+import os
+import stat
+
+from chromite.lib import commandline
+from chromite.lib import cros_build_lib
+from chromite.lib import osutils
+
+
+# Path pattern to search for the gconv-modules file.
+GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'
+
+# Sticky modules. These charset modules are always included even if they
+# aren't used. You can specify any charset name as supported by 'iconv_open',
+# for example, 'LATIN1' or 'ISO-8859-1'.
+STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')
+
+# List of function names (symbols) known to use a charset as a parameter.
+GCONV_SYMBOLS = (
+    # glibc
+    'iconv_open',
+    'iconv',
+    # glib
+    'g_convert',
+    'g_convert_with_fallback',
+    'g_iconv',
+    'g_locale_to_utf8',
+    'g_get_charset',
+)
+
+
+class GconvModules(object):
+  """Class to manipulate the gconv/gconv-modules file and referenced modules.
+
+  This class parses the contents of the gconv-modules file installed by glibc
+  which provides the definition of the charsets supported by iconv_open(3). It
+  can load the current gconv-modules file and rewrite it to include only a
+  subset of the supported modules, removing the other modules.
+
+  Each charset is involved in some transformation between that charset and an
+  internal representation. This transformation is defined in a .so file loaded
+  dynamically with dlopen(3) when the charset defined in this file is requested
+  from iconv_open(3).
+
+  See the comments on gconv-modules file for syntax details.
+  """
+
+  def __init__(self, gconv_modules_fn):
+    """Initialize the class.
+
+    Args:
+      gconv_modules_fn: Path to gconv/gconv-modules file.
+    """
+    self._filename = gconv_modules_fn
+
+    # An alias map of charsets. The key (fromcharset) is the alias name and
+    # the value (tocharset) is the real charset name. We also support a value
+    # that is an alias for another charset.
+    self._alias = {}
+
+    # The modules dict goes from charset to module names (the filenames without
+    # the .so extension). Since several transformations involving the same
+    # charset could be defined in different files, the values of this dict are
+    # a set of module names.
+    self._modules = {}
+
+  def Load(self):
+    """Load the charsets from gconv-modules."""
+    for line in open(self._filename):
+      line = line.split('#', 1)[0].strip()
+      if not line: # Comment
+        continue
+
+      lst = line.split()
+      if lst[0] == 'module':
+        _, fromset, toset, filename = lst[:4]
+        for charset in (fromset, toset):
+          charset = charset.rstrip('/')
+          mods = self._modules.get(charset, set())
+          mods.add(filename)
+          self._modules[charset] = mods
+      elif lst[0] == 'alias':
+        _, fromset, toset = lst
+        fromset = fromset.rstrip('/')
+        toset = toset.rstrip('/')
+        # Warn if the same charset is defined as two different aliases.
+        if self._alias.get(fromset, toset) != toset:
+          cros_build_lib.Error('charset "%s" already defined as "%s".',
+                               fromset, self._alias[fromset])
+        self._alias[fromset] = toset
+      else:
+        cros_build_lib.Die('Unknown line: %s', line)
+
+    cros_build_lib.Debug('Found %d modules and %d alias in %s',
+                         len(self._modules), len(self._alias), self._filename)
+    charsets = sorted(self._alias.keys() + self._modules.keys())
+    # Remove the 'INTERNAL' charset from the list, since it is not a charset
+    # but an internal representation used to convert to and from other charsets.
+    if 'INTERNAL' in charsets:
+      charsets.remove('INTERNAL')
+    return charsets
+
+  def Rewrite(self, used_charsets, dry_run=False):
+    """Rewrite gconv-modules file with only the used charsets.
+
+    Args:
+      used_charsets: A list of used charsets. This should be a subset of the
+                     list returned by Load().
+      dry_run: Whether this function should not change any file.
+    """
+
+    # Compute the used modules.
+    used_modules = set()
+    for charset in used_charsets:
+      while charset in self._alias:
+        charset = self._alias[charset]
+      used_modules.update(self._modules[charset])
+    unused_modules = reduce(set.union, self._modules.values()) - used_modules
+
+    cros_build_lib.Debug('Used modules: %s', ', '.join(sorted(used_modules)))
+
+    modules_dir = os.path.dirname(self._filename)
+    unused_size = 0
+    for module in sorted(unused_modules):
+      module_path = os.path.join(modules_dir, '%s.so' % module)
+      unused_size += os.lstat(module_path).st_size
+      cros_build_lib.Debug('rm %s', module_path)
+      if not dry_run:
+        os.unlink(module_path)
+    cros_build_lib.Info('Done. Using %d gconv modules. Removed %d unused'
+                        ' modules (%.1f KiB)',
+                        len(used_modules), len(unused_modules),
+                        unused_size / 1024.)
+
+    # Recompute the gconv-modules file with only the included gconv modules.
+    result = []
+    for line in open(self._filename):
+      lst = line.split('#', 1)[0].strip().split()
+
+      if not lst:
+        result.append(line)  # Keep comments and copyright headers.
+      elif lst[0] == 'module':
+        _, _, _, filename = lst[:4]
+        if filename in used_modules:
+          result.append(line)  # Used module
+      elif lst[0] == 'alias':
+        _, charset, _ = lst
+        charset = charset.rstrip('/')
+        while charset in self._alias:
+          charset = self._alias[charset]
+        if used_modules.intersection(self._modules[charset]):
+          result.append(line)  # Alias to an used module
+      else:
+        cros_build_lib.Die('Unknown line: %s', line)
+
+    if not dry_run:
+      osutils.WriteFile(self._filename, ''.join(result))
+
+
+def MultipleStringMatch(patterns, corpus):
+  """Search a list of strings in a corpus string.
+
+  Args:
+    patterns: A list of strings.
+    corpus: The text where to search for the strings.
+
+  Returns:
+    A list of Booleans stating whether each pattern string was found in the
+    corpus or not.
+  """
+  tree = ahocorasick.KeywordTree()
+  for word in patterns:
+    tree.add(word)
+  tree.make()
+
+  result = [False] * len(patterns)
+  for i, j in tree.findall(corpus):
+    match = corpus[i:j]
+    result[patterns.index(match)] = True
+
+  return result
+
+
+def GconvStrip(opts):
+  """Process gconv-modules and remove unused modules.
+
+  Args:
+    opts: The command-line args passed to the script.
+
+  Returns:
+    The exit code number indicating whether the process succeeded.
+  """
+  root_st = os.lstat(opts.root)
+  if not stat.S_ISDIR(root_st.st_mode):
+    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)
+
+  # Detect the possible locations of the gconv-modules file.
+  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))
+
+  if not gconv_modules_files:
+    cros_build_lib.Warning('gconv-modules file not found.')
+    return 1
+
+  # Only one gconv-modules file should be present, either on /usr/lib or
+  # /usr/lib64, but not both.
+  if len(gconv_modules_files) > 1:
+    cros_build_lib.Die('Found several gconv-modules files.')
+
+  gconv_modules_fn = gconv_modules_files[0]
+  cros_build_lib.Info('Searching for unused gconv files defined in %s',
+                      gconv_modules_fn)
+
+  gmods = GconvModules(gconv_modules_fn)
+  charsets = gmods.Load()
+
+  # Use scanelf to search for all the binary files on the rootfs that require
+  # or define any symbol in GCONV_SYMBOLS. We also include the binaries that
+  # define them since there could be internal calls from other functions.
+  files = set()
+  for symbol in GCONV_SYMBOLS:
+    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
+           '--symbol', symbol, opts.root]
+    result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
+                                       print_cmd=False)
+    symbol_files = result.output.splitlines()
+    cros_build_lib.Debug('Symbol %s found on %d files.',
+                         symbol, len(symbol_files))
+    files.update(symbol_files)
+
+  # The charsets are represented as nul-terminated strings in the binary files,
+  # so we append the '\0' to each string. This prevents some false positives
+  # when the name of the charset is a substring of some other string. It doesn't
+  # prevent false positives when the charset name is the suffix of another
+  # string, for example a binary with the string "DON'T DO IT\0" will match the
+  # 'IT' charset. Empirical tests on ChromeOS images suggest that only 4
+  # charsets could fall in this category.
+  strings = [s + '\0' for s in charsets]
+  cros_build_lib.Info('Will search for %d strings in %d files',
+                      len(strings), len(files))
+
+  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
+  # strings should be listed in the gconv-modules file.
+  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
+  if unknown_sticky_modules:
+    cros_build_lib.Warning(
+        'The following charsets were explicitly requested in STICKY_MODULES '
+        'even though they don\'t exist: %s',
+        ', '.join(unknown_sticky_modules))
+  global_used = [charset in STICKY_MODULES for charset in charsets]
+
+  for fn in files:
+    used_fn = MultipleStringMatch(strings, osutils.ReadFile(fn, mode='rb'))
+
+    global_used = map(operator.or_, global_used, used_fn)
+    # Check the debug flag to avoid running a useless loop.
+    if opts.debug and any(used_fn):
+      cros_build_lib.Debug('File %s:', fn)
+      for i in range(len(used_fn)):
+        if used_fn[i]:
+          cros_build_lib.Debug(' - %s', strings[i])
+
+  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
+  gmods.Rewrite(used_charsets, opts.dry_run)
+  return 0
+
+
+def ParseArgs(argv):
+  """Return parsed commandline arguments."""
+
+  parser = commandline.ArgumentParser()
+  parser.add_argument(
+      '--dry-run', action='store_true', default=False,
+      help='process but don\'t modify any file.')
+  parser.add_argument(
+      'root', type='path',
+      help='path to the directory where the rootfs is mounted.')
+
+  opts = parser.parse_args(argv)
+  opts.Freeze()
+  return opts
+
+
+def main(argv):
+  """Main function to start the script."""
+  opts = ParseArgs(argv)
+  cros_build_lib.Debug('Options are %s', opts)
+
+  return GconvStrip(opts)