scripts: gconv_strip: process gconv-modules.d

Newer glibc versions moved most gconv module information out of the
gconv-modules file and into a gconv-modules-extra.conf file in
gconv-modules.d [1]. gconv_strip currently only reads the gconv-modules
file, so it no longer sees most modules and strips very little.

This change updates gconv_strip to search for additional files in
gconv-modules.d and process them in the exact same way as it processes
the gconv-modules file.

[1]: https://sourceware.org/git/?p=glibc.git;a=blob;f=NEWS;h=f976abccbd6ffe3c2d25b6d22bc9e042ab394fab;hb=7f079fdc16e88ebb8020e17b2fd900e8924da29a#l775

BUG=b:277779682
TEST=build a tatl image; check /usr/lib64/gconv for removed modules

Change-Id: Idce5cf6d4898e41759989f757e719ae63bbdebdd
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/chromite/+/4679252
Tested-by: Robert Kolchmeyer <rkolchmeyer@google.com>
Commit-Queue: Robert Kolchmeyer <rkolchmeyer@google.com>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Reviewed-by: Alex Klein <saklein@chromium.org>
diff --git a/scripts/gconv_strip.py b/scripts/gconv_strip.py
index 4f509f5..1878003 100644
--- a/scripts/gconv_strip.py
+++ b/scripts/gconv_strip.py
@@ -64,13 +64,15 @@
     See the comments on gconv-modules file for syntax details.
     """
 
-    def __init__(self, gconv_modules_file):
+    def __init__(self, gconv_modules_file, modules_dir):
         """Initialize the class.
 
         Args:
             gconv_modules_file: Path to gconv/gconv-modules file.
+            modules_dir: Path to the directory that contains the gconv modules.
         """
         self._filename = gconv_modules_file
+        self._modules_dir = modules_dir
 
         # An alias map of charsets. The key (fromcharset) is the alias name and
         # the value (tocharset) is the real charset name. We also support a
@@ -150,8 +152,6 @@
             - used_modules
         )
 
-        modules_dir = os.path.dirname(self._filename)
-
         all_modules = set.union(used_modules, unused_modules)
         # The list of charsets that depend on a given library. For example,
         # libdeps['libCNS.so'] is the set of all the modules that require that
@@ -159,7 +159,9 @@
         libdeps = {}
         for module in all_modules:
             deps = lddtree.ParseELF(
-                os.path.join(modules_dir, "%s.so" % module), modules_dir, []
+                os.path.join(self._modules_dir, "%s.so" % module),
+                self._modules_dir,
+                [],
             )
             if "needed" not in deps:
                 continue
@@ -181,7 +183,7 @@
 
         unused_size = 0
         for module in sorted(unused_modules):
-            module_path = os.path.join(modules_dir, "%s.so" % module)
+            module_path = os.path.join(self._modules_dir, "%s.so" % module)
             unused_size += os.lstat(module_path).st_size
             logging.debug("rm %s", module_path)
             if not dryrun:
@@ -189,7 +191,7 @@
 
         unused_libdeps_size = 0
         for lib in sorted(unused_libdeps):
-            lib_path = os.path.join(modules_dir, lib)
+            lib_path = os.path.join(self._modules_dir, lib)
             unused_libdeps_size += os.lstat(lib_path).st_size
             logging.debug("rm %s", lib_path)
             if not dryrun:
@@ -288,8 +290,19 @@
         "Searching for unused gconv files defined in %s", gconv_modules_file
     )
 
-    gmods = GconvModules(gconv_modules_file)
-    charsets = gmods.Load()
+    # Additional gconv-modules configuration files can be present in the
+    # co-located gconv-modules.d. glibc installs a gconv-modules-extra.conf
+    # here by default.
+    modules_dir = os.path.dirname(gconv_modules_file)
+    extras = glob.glob(
+        os.path.join(
+            modules_dir,
+            os.path.basename(gconv_modules_file) + ".d",
+            "*.conf",
+        )
+    )
+    gmods_groups = [GconvModules(gconv_modules_file, modules_dir)]
+    gmods_groups.extend(GconvModules(x, modules_dir) for x in extras)
 
     # Use scanelf to search for all the binary files on the rootfs that require
     # or define the symbol iconv_open. We also include the binaries that define
@@ -312,46 +325,48 @@
     files = set(result.stdout.splitlines())
     logging.debug("Symbols %s found on %d files.", symbols, len(files))
 
-    # The charsets are represented as nul-terminated strings in the binary
-    # files, so we append the '\0' to each string. This prevents some false
-    # positives when the name of the charset is a substring of some other
-    # string. It doesn't prevent false positives when the charset name is the
-    # suffix of another string, for example a binary with the string "DON'T DO
-    # IT\0" will match the 'IT' charset. Empirical test on ChromeOS images
-    # suggests that only 4 charsets could fall in category.
-    strings = [s.encode("utf-8") + b"x\00" for s in charsets]
-    logging.info(
-        "Will search for %d strings in %d files", len(strings), len(files)
-    )
-
-    # Charsets listed in STICKY_MOUDLES are initialized as used. Note that those
-    # strings should be listed in the gconv-modules file.
-    unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
-    if unknown_sticky_modules:
-        logging.warning(
-            "The following charsets were explicitly requested in "
-            "STICKY_MODULES even though they don't exist: %s",
-            ", ".join(unknown_sticky_modules),
-        )
-    global_used = [charset in STICKY_MODULES for charset in charsets]
-
-    for filename in files:
-        used_filenames = MultipleStringMatch(
-            strings, osutils.ReadFile(filename, mode="rb")
+    for gmods in gmods_groups:
+        charsets = gmods.Load()
+        # The charsets are represented as nul-terminated strings in the binary
+        # files, so we append the '\0' to each string. This prevents some false
+        # positives when the name of the charset is a substring of some other
+        # string. It doesn't prevent false positives when the charset name is
+        # the suffix of another string, for example a binary with the string
+        # "DON'T DO IT\0" will match the 'IT' charset. Empirical test on
+        # ChromeOS images suggests only 4 charsets could fall in this category.
+        strings = [s.encode("utf-8") + b"x\00" for s in charsets]
+        logging.info(
+            "Will search for %d strings in %d files", len(strings), len(files)
         )
 
-        global_used = [
-            operator.or_(*x) for x in zip(global_used, used_filenames)
-        ]
-        # Check the debug flag to avoid running a useless loop.
-        if opts.debug and any(used_filenames):
-            logging.debug("File %s:", filename)
-            for i, used_filename in enumerate(used_filenames):
-                if used_filename:
-                    logging.debug(" - %s", strings[i])
+        # Charsets listed in STICKY_MODULES are initialized as used. Note that
+        # those strings should be listed in the gconv-modules file.
+        unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
+        if unknown_sticky_modules:
+            logging.warning(
+                "The following charsets were explicitly requested in "
+                "STICKY_MODULES even though they don't exist: %s",
+                ", ".join(unknown_sticky_modules),
+            )
+        global_used = [charset in STICKY_MODULES for charset in charsets]
 
-    used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
-    gmods.Rewrite(used_charsets, opts.dryrun)
+        for filename in files:
+            used_filenames = MultipleStringMatch(
+                strings, osutils.ReadFile(filename, mode="rb")
+            )
+
+            global_used = [
+                operator.or_(*x) for x in zip(global_used, used_filenames)
+            ]
+            # Check the debug flag to avoid running a useless loop.
+            if opts.debug and any(used_filenames):
+                logging.debug("File %s:", filename)
+                for i, used_filename in enumerate(used_filenames):
+                    if used_filename:
+                        logging.debug(" - %s", strings[i])
+
+        used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
+        gmods.Rewrite(used_charsets, opts.dryrun)
     return 0