blob: 187800345c430b392ad88afc78624eca6df5d1f2 [file] [log] [blame]
# Copyright 2014 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to remove unused gconv charset modules from a build."""
6
Mike Frysinger9997cc02019-07-17 15:50:01 -04007import functools
Alex Deymo2bba3812014-08-13 08:49:09 -07008import glob
Chris McDonald59650c32021-07-20 15:29:28 -06009import logging
Alex Deymo2bba3812014-08-13 08:49:09 -070010import operator
11import os
12import stat
13
Chris McDonald59650c32021-07-20 15:29:28 -060014from chromite.third_party import lddtree
15
Alex Deymo2bba3812014-08-13 08:49:09 -070016from chromite.lib import commandline
17from chromite.lib import cros_build_lib
18from chromite.lib import osutils
Mike Frysinger95452702021-01-23 00:07:22 -050019
Alex Deymo2bba3812014-08-13 08:49:09 -070020
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070021try:
Alex Klein1699fab2022-09-08 08:46:06 -060022 import pytest # pylint: disable=import-error
23
24 ahocorasick = pytest.importorskip("ahocorasick")
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070025except ImportError:
Alex Klein1699fab2022-09-08 08:46:06 -060026 import ahocorasick
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070027
Alex Deymo2bba3812014-08-13 08:49:09 -070028
# Glob (relative to the rootfs) used to locate the gconv-modules file.
GCONV_MODULES_PATH = "usr/*/gconv/gconv-modules"

# Sticky modules: charsets that are always kept, even when no binary appears
# to reference them. Any charset name accepted by 'iconv_open' can be listed,
# for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    "UTF-16",
    "UTF-32",
    "UNICODE",
)

# Function names (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    "iconv_open",
    "iconv",
    # Provided by glib.
    "g_convert",
    "g_convert_with_fallback",
    "g_iconv",
    "g_locale_to_utf8",
    "g_get_charset",
)
49
50
class GconvModules:
    """Class to manipulate the gconv/gconv-modules file and referenced modules.

    This class parses the contents of the gconv-modules file installed by glibc
    which provides the definition of the charsets supported by iconv_open(3).
    It allows loading the current gconv-modules file and rewriting it to
    include only a subset of the supported modules, removing the other modules.

    Each charset is involved in some transformation between that charset and an
    internal representation. This transformation is defined in a .so file
    loaded dynamically with dlopen(3) when the charset defined in this file is
    requested to iconv_open(3).

    See the comments in the gconv-modules file for syntax details.
    """

    def __init__(self, gconv_modules_file, modules_dir):
        """Initialize the class.

        Args:
            gconv_modules_file: Path to gconv/gconv-modules file.
            modules_dir: Path to the directory that contains the gconv modules.
        """
        self._filename = gconv_modules_file
        self._modules_dir = modules_dir

        # An alias map of charsets. The key (fromcharset) is the alias name and
        # the value (tocharset) is the real charset name. We also support a
        # value that is an alias for another charset.
        self._alias = {}

        # The modules dict goes from charset to module names (the filenames
        # without the .so extension). Since several transformations involving
        # the same charset could be defined in different files, the values of
        # this dict are a set of module names.
        self._modules = {}

    def Load(self):
        """Load the charsets from gconv-modules.

        Populates the alias and module maps from the "alias" and "module"
        lines of the file. Any other non-blank, non-comment line is reported
        through cros_build_lib.Die().

        Returns:
            A sorted list with every alias name and every charset that has a
            module, minus the 'INTERNAL' pseudo-charset.
        """
        with open(self._filename, encoding="utf-8") as fp:
            for line in fp:
                line = line.split("#", 1)[0].strip()
                if not line:
                    # Ignore blank lines & comments.
                    continue

                lst = line.split()
                if lst[0] == "module":
                    # Only the first four fields matter here; anything after
                    # the filename is ignored.
                    _, fromset, toset, filename = lst[:4]
                    for charset in (fromset, toset):
                        charset = charset.rstrip("/")
                        self._modules.setdefault(charset, set()).add(filename)
                elif lst[0] == "alias":
                    _, fromset, toset = lst
                    fromset = fromset.rstrip("/")
                    toset = toset.rstrip("/")
                    # Warn if the same charset is defined as two different
                    # aliases.
                    if self._alias.get(fromset, toset) != toset:
                        logging.error(
                            'charset "%s" already defined as "%s".',
                            fromset,
                            self._alias[fromset],
                        )
                    # The most recent definition wins.
                    self._alias[fromset] = toset
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        logging.debug(
            "Found %d modules and %d alias in %s",
            len(self._modules),
            len(self._alias),
            self._filename,
        )
        charsets = sorted(list(self._alias) + list(self._modules))
        # Remove the 'INTERNAL' charset from the list, since it is not a charset
        # but an internal representation used to convert to and from other
        # charsets.
        if "INTERNAL" in charsets:
            charsets.remove("INTERNAL")
        return charsets

    def Rewrite(self, used_charsets, dryrun=False):
        """Rewrite gconv-modules file with only the used charsets.

        Unless |dryrun| is set, the .so files of the modules that no used
        charset needs are deleted from the modules directory, together with
        the dependency libraries that only those modules required, and the
        gconv-modules file is rewritten without the corresponding lines.

        Args:
            used_charsets: A list of used charsets. This should be a subset of
                the list returned by Load().
            dryrun: Whether this function should not change any file.

        Raises:
            KeyError: If a charset (after following its aliases) has no module
                defined in this file.
        """

        # Compute the used modules.
        used_modules = set()
        for charset in used_charsets:
            # Follow the alias chain down to the real charset name.
            while charset in self._alias:
                charset = self._alias[charset]
            used_modules.update(self._modules[charset])

        # The explicit initial value keeps reduce() from raising TypeError when
        # the file defines no modules at all.
        known_modules = functools.reduce(
            set.union, self._modules.values(), set()
        )
        unused_modules = known_modules - used_modules

        all_modules = set.union(used_modules, unused_modules)
        # The list of charsets that depend on a given library. For example,
        # libdeps['libCNS.so'] is the set of all the modules that require that
        # library. These libraries live in the same directory as the modules.
        libdeps = {}
        for module in all_modules:
            deps = lddtree.ParseELF(
                os.path.join(self._modules_dir, "%s.so" % module),
                self._modules_dir,
                [],
            )
            if "needed" not in deps:
                continue
            for lib in deps["needed"]:
                # Ignore the libs without a path defined (outside the
                # modules_dir).
                if deps["libs"][lib]["path"]:
                    libdeps.setdefault(lib, set()).add(module)

        # A dependency is kept as soon as one used module requires it.
        used_libdeps = set(
            lib
            for lib, deps in libdeps.items()
            if deps.intersection(used_modules)
        )
        unused_libdeps = set(libdeps).difference(used_libdeps)

        logging.debug("Used modules: %s", ", ".join(sorted(used_modules)))
        logging.debug(
            "Used dependency libs: %s", ", ".join(sorted(used_libdeps))
        )

        unused_size = 0
        for module in sorted(unused_modules):
            module_path = os.path.join(self._modules_dir, "%s.so" % module)
            unused_size += os.lstat(module_path).st_size
            logging.debug("rm %s", module_path)
            if not dryrun:
                os.unlink(module_path)

        unused_libdeps_size = 0
        for lib in sorted(unused_libdeps):
            lib_path = os.path.join(self._modules_dir, lib)
            unused_libdeps_size += os.lstat(lib_path).st_size
            logging.debug("rm %s", lib_path)
            if not dryrun:
                os.unlink(lib_path)

        logging.info(
            "Done. Using %d gconv modules. Removed %d unused modules"
            " (%.1f KiB) and %d unused dependencies (%.1f KiB)",
            len(used_modules),
            len(unused_modules),
            unused_size / 1024.0,
            len(unused_libdeps),
            unused_libdeps_size / 1024.0,
        )

        # Recompute the gconv-modules file with only the included gconv modules.
        result = []
        with open(self._filename, encoding="utf-8") as fp:
            for line in fp:
                lst = line.split("#", 1)[0].strip().split()

                if not lst:
                    # Keep comments and copyright headers.
                    result.append(line)
                elif lst[0] == "module":
                    _, _, _, filename = lst[:4]
                    if filename in used_modules:
                        # Used module
                        result.append(line)
                elif lst[0] == "alias":
                    _, charset, _ = lst
                    charset = charset.rstrip("/")
                    while charset in self._alias:
                        charset = self._alias[charset]
                    if used_modules.intersection(self._modules[charset]):
                        # Alias to an used module
                        result.append(line)
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        if not dryrun:
            osutils.WriteFile(self._filename, "".join(result))
Alex Deymo2bba3812014-08-13 08:49:09 -0700237
238
def MultipleStringMatch(patterns, corpus):
    """Search a list of strings in a corpus string.

    All the patterns are searched for in a single pass over the corpus using
    an Aho-Corasick automaton.

    Args:
        patterns: A list of strings.
        corpus: The text where to search for the strings.

    Returns:
        A list of Booleans stating whether each pattern string was found in the
        corpus or not.
    """
    if not patterns:
        # Without any word the automaton is never built and cannot be
        # iterated, so answer directly: nothing to look for.
        return []

    result = [False] * len(patterns)

    # Each word stores its index in |patterns| so a hit maps back to a slot.
    tree = ahocorasick.Automaton()
    for i, word in enumerate(patterns):
        tree.add_word(word, i)
    tree.make_automaton()

    for _, i in tree.iter(corpus):
        result[i] = True

    return result
Alex Deymo2bba3812014-08-13 08:49:09 -0700261
262
def GconvStrip(opts):
    """Process gconv-modules and remove unused modules.

    Locates the gconv-modules file (and any gconv-modules.d/*.conf next to
    it) under opts.root, uses scanelf to find the binaries that reference one
    of GCONV_SYMBOLS, searches those binaries for every known charset name and
    asks each GconvModules group to drop the charsets that were not found.

    Args:
        opts: The command-line args passed to the script.

    Returns:
        The exit code number indicating whether the process succeeded: 0 on
        success, 1 when no gconv-modules file was found.
    """
    root_st = os.lstat(opts.root)
    if not stat.S_ISDIR(root_st.st_mode):
        cros_build_lib.Die("root (%s) must be a directory.", opts.root)

    # Detect the possible locations of the gconv-modules file.
    gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

    if not gconv_modules_files:
        logging.warning("gconv-modules file not found.")
        return 1

    # Only one gconv-modules files should be present, either on /usr/lib or
    # /usr/lib64, but not both.
    if len(gconv_modules_files) > 1:
        cros_build_lib.Die("Found several gconv-modules files.")

    gconv_modules_file = gconv_modules_files[0]
    logging.info(
        "Searching for unused gconv files defined in %s", gconv_modules_file
    )

    # Additional gconv-modules configuration files can be present in the
    # co-located gconv-modules.d. glibc installs a gconv-modules-extra.conf
    # here by default.
    modules_dir = os.path.dirname(gconv_modules_file)
    extras = glob.glob(
        os.path.join(
            modules_dir,
            os.path.basename(gconv_modules_file) + ".d",
            "*.conf",
        )
    )
    gmods_groups = [GconvModules(gconv_modules_file, modules_dir)]
    gmods_groups.extend(GconvModules(x, modules_dir) for x in extras)

    # Use scanelf to search for all the binary files on the rootfs that require
    # or define the symbol iconv_open. We also include the binaries that define
    # it since there could be internal calls to it from other functions.
    symbols = ",".join(GCONV_SYMBOLS)
    cmd = [
        "scanelf",
        "--mount",
        "--quiet",
        "--recursive",
        "--format",
        "#s%F",
        "--symbol",
        symbols,
        opts.root,
    ]
    result = cros_build_lib.run(
        cmd, stdout=True, print_cmd=False, encoding="utf-8"
    )
    files = set(result.stdout.splitlines())
    logging.debug("Symbols %s found on %d files.", symbols, len(files))

    for gmods in gmods_groups:
        charsets = gmods.Load()
        # The charsets are represented as nul-terminated strings in the binary
        # files, so we append the '\0' to each string. This prevents some false
        # positives when the name of the charset is a substring of some other
        # string. It doesn't prevent false positives when the charset name is
        # the suffix of another string, for example a binary with the string
        # "DON'T DO IT\0" will match the 'IT' charset. Empirical test on
        # ChromeOS images suggests that only 4 charsets could fall in category.
        strings = [s.encode("utf-8") + b"\x00" for s in charsets]
        logging.info(
            "Will search for %d strings in %d files", len(strings), len(files)
        )

        # Charsets listed in STICKY_MODULES are initialized as used. Note that
        # those strings should be listed in the gconv-modules file.
        unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
        if unknown_sticky_modules:
            logging.warning(
                "The following charsets were explicitly requested in "
                "STICKY_MODULES even though they don't exist: %s",
                ", ".join(unknown_sticky_modules),
            )
        # global_used[i] tells whether charsets[i] has to be kept.
        global_used = [charset in STICKY_MODULES for charset in charsets]

        for filename in files:
            used_filenames = MultipleStringMatch(
                strings, osutils.ReadFile(filename, mode="rb")
            )

            # Accumulate the hits of this file into the global result.
            global_used = [
                operator.or_(*x) for x in zip(global_used, used_filenames)
            ]
            # Check the debug flag to avoid running a useless loop.
            if opts.debug and any(used_filenames):
                logging.debug("File %s:", filename)
                for i, used_filename in enumerate(used_filenames):
                    if used_filename:
                        logging.debug(" - %s", strings[i])

        used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
        gmods.Rewrite(used_charsets, opts.dryrun)
    return 0
Alex Deymo2bba3812014-08-13 08:49:09 -0700371
372
def ParseArgs(argv):
    """Parse |argv| and return the frozen options object."""
    arg_parser = commandline.ArgumentParser(description=__doc__, dryrun=True)

    # Single positional argument: the rootfs to process.
    root_kwargs = {
        "type": "path",
        "help": "path to the directory where the rootfs is mounted.",
    }
    arg_parser.add_argument("root", **root_kwargs)

    options = arg_parser.parse_args(argv)
    options.Freeze()
    return options
Alex Deymo2bba3812014-08-13 08:49:09 -0700386
387
def main(argv):
    """Script entry point.

    Args:
        argv: The command-line arguments, handed to ParseArgs().

    Returns:
        The exit code produced by GconvStrip().
    """
    options = ParseArgs(argv)
    logging.debug("Options are %s", options)

    exit_code = GconvStrip(options)
    return exit_code
Alex Klein1699fab2022-09-08 08:46:06 -0600393 return GconvStrip(opts)