blob: 75c372deeff8ff0317f571195e5f87c616e9912e [file] [log] [blame]
Mike Frysingerf1ba7ad2022-09-12 05:42:57 -04001# Copyright 2014 The ChromiumOS Authors
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Script to remove unused gconv charset modules from a build."""
6
Mike Frysinger9997cc02019-07-17 15:50:01 -04007import functools
Alex Deymo2bba3812014-08-13 08:49:09 -07008import glob
Chris McDonald59650c32021-07-20 15:29:28 -06009import logging
Alex Deymo2bba3812014-08-13 08:49:09 -070010import operator
11import os
12import stat
13
Chris McDonald59650c32021-07-20 15:29:28 -060014from chromite.third_party import lddtree
15
Alex Deymo2bba3812014-08-13 08:49:09 -070016from chromite.lib import commandline
17from chromite.lib import cros_build_lib
18from chromite.lib import osutils
Mike Frysinger95452702021-01-23 00:07:22 -050019
Alex Deymo2bba3812014-08-13 08:49:09 -070020
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070021try:
Alex Klein1699fab2022-09-08 08:46:06 -060022 import pytest # pylint: disable=import-error
23
24 ahocorasick = pytest.importorskip("ahocorasick")
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070025except ImportError:
Alex Klein1699fab2022-09-08 08:46:06 -060026 import ahocorasick
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070027
Alex Deymo2bba3812014-08-13 08:49:09 -070028
# Glob pattern, relative to the rootfs, used to locate the gconv-modules file.
GCONV_MODULES_PATH = "usr/*/gconv/gconv-modules"

# Sticky modules: charsets that are always kept, even when no binary appears
# to reference them. Any charset name accepted by 'iconv_open' may be listed
# here, for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    "UTF-16",
    "UTF-32",
    "UNICODE",
)

# Names of functions (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    "iconv_open",
    "iconv",
    # Provided by glib.
    "g_convert",
    "g_convert_with_fallback",
    "g_iconv",
    "g_locale_to_utf8",
    "g_get_charset",
)
49
50
class GconvModules:
    """Class to manipulate the gconv/gconv-modules file and referenced modules.

    This class parses the contents of the gconv-modules file installed by glibc
    which provides the definition of the charsets supported by iconv_open(3). It
    allows to load the current gconv-modules file and rewrite it to include only
    a subset of the supported modules, removing the other modules.

    Each charset is involved on some transformation between that charset and an
    internal representation. This transformation is defined on a .so file loaded
    dynamically with dlopen(3) when the charset defined in this file is
    requested to iconv_open(3).

    See the comments on gconv-modules file for syntax details.
    """

    def __init__(self, gconv_modules_file):
        """Initialize the class.

        Args:
            gconv_modules_file: Path to gconv/gconv-modules file.
        """
        self._filename = gconv_modules_file

        # An alias map of charsets. The key (fromcharset) is the alias name and
        # the value (tocharset) is the real charset name. We also support a
        # value that is an alias for another charset.
        self._alias = {}

        # The modules dict goes from charset to module names (the filenames
        # without the .so extension). Since several transformations involving
        # the same charset could be defined in different files, the values of
        # this dict are a set of module names.
        self._modules = {}

    def _ResolveAlias(self, charset):
        """Follow the alias chain starting at |charset|.

        Args:
            charset: A charset name, possibly an alias.

        Returns:
            The first name reached that is not itself an alias. If the aliases
            form a cycle, the walk stops when a name repeats and that name is
            returned, rather than looping forever.
        """
        visited = set()
        while charset in self._alias and charset not in visited:
            visited.add(charset)
            charset = self._alias[charset]
        return charset

    @staticmethod
    def _RemoveFiles(paths, dryrun):
        """Remove |paths| (unless |dryrun|) and return their total size.

        Args:
            paths: Iterable of file paths to remove, processed in order.
            dryrun: When true, only log what would be removed.

        Returns:
            The sum of the lstat() sizes of the listed files, in bytes.
        """
        total_size = 0
        for path in paths:
            total_size += os.lstat(path).st_size
            logging.debug("rm %s", path)
            if not dryrun:
                os.unlink(path)
        return total_size

    def Load(self):
        """Load the charsets from gconv-modules.

        Returns:
            A sorted list with every alias name and every charset name found in
            the file, with the 'INTERNAL' pseudo charset removed.
        """
        with open(self._filename, encoding="utf-8") as fp:
            for line in fp:
                line = line.split("#", 1)[0].strip()
                if not line:
                    # Ignore blank lines & comments.
                    continue

                lst = line.split()
                if lst[0] == "module":
                    _, fromset, toset, filename = lst[:4]
                    for charset in (fromset, toset):
                        charset = charset.rstrip("/")
                        self._modules.setdefault(charset, set()).add(filename)
                elif lst[0] == "alias":
                    _, fromset, toset = lst
                    fromset = fromset.rstrip("/")
                    toset = toset.rstrip("/")
                    # Warn if the same charset is defined as two different
                    # aliases.
                    if self._alias.get(fromset, toset) != toset:
                        logging.error(
                            'charset "%s" already defined as "%s".',
                            fromset,
                            self._alias[fromset],
                        )
                    self._alias[fromset] = toset
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        logging.debug(
            "Found %d modules and %d alias in %s",
            len(self._modules),
            len(self._alias),
            self._filename,
        )
        charsets = sorted(list(self._alias) + list(self._modules))
        # Remove the 'INTERNAL' charset from the list, since it is not a charset
        # but an internal representation used to convert to and from other
        # charsets.
        if "INTERNAL" in charsets:
            charsets.remove("INTERNAL")
        return charsets

    def Rewrite(self, used_charsets, dryrun=False):
        """Rewrite gconv-modules file with only the used charsets.

        The module .so files (and the helper libraries living next to them)
        that are not needed by any used charset are removed, and the
        gconv-modules file is rewritten without the lines that refer to them.

        Args:
            used_charsets: A list of used charsets. This should be a subset of
                the list returned by Load().
            dryrun: Whether this function should not change any file.

        Raises:
            KeyError: If a charset (after alias resolution) has no module
                defined in the loaded gconv-modules file.
        """
        # Compute the used modules.
        used_modules = set()
        for charset in used_charsets:
            used_modules.update(self._modules[self._ResolveAlias(charset)])

        # set().union(*...) also works when no module is defined at all, where
        # reducing an empty sequence would raise.
        all_modules = set().union(*self._modules.values())
        unused_modules = all_modules - used_modules

        modules_dir = os.path.dirname(self._filename)

        # The list of charsets that depend on a given library. For example,
        # libdeps['libCNS.so'] is the set of all the modules that require that
        # library. These libraries live in the same directory as the modules.
        libdeps = {}
        for module in all_modules:
            elf = lddtree.ParseELF(
                os.path.join(modules_dir, "%s.so" % module), modules_dir, []
            )
            if "needed" not in elf:
                continue
            for lib in elf["needed"]:
                # Ignore the libs without a path defined (outside the
                # modules_dir).
                if elf["libs"][lib]["path"]:
                    libdeps.setdefault(lib, set()).add(module)

        used_libdeps = {
            lib
            for lib, dependents in libdeps.items()
            if dependents.intersection(used_modules)
        }
        unused_libdeps = set(libdeps).difference(used_libdeps)

        logging.debug("Used modules: %s", ", ".join(sorted(used_modules)))
        # The joined names are the log argument, not part of the format string.
        logging.debug(
            "Used dependency libs: %s", ", ".join(sorted(used_libdeps))
        )

        unused_size = self._RemoveFiles(
            (
                os.path.join(modules_dir, "%s.so" % module)
                for module in sorted(unused_modules)
            ),
            dryrun,
        )
        unused_libdeps_size = self._RemoveFiles(
            (os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)),
            dryrun,
        )

        logging.info(
            "Done. Using %d gconv modules. Removed %d unused modules"
            " (%.1f KiB) and %d unused dependencies (%.1f KiB)",
            len(used_modules),
            len(unused_modules),
            unused_size / 1024.0,
            len(unused_libdeps),
            unused_libdeps_size / 1024.0,
        )

        # Recompute the gconv-modules file with only the included gconv modules.
        result = []
        with open(self._filename, encoding="utf-8") as fp:
            for line in fp:
                lst = line.split("#", 1)[0].strip().split()

                if not lst:
                    # Keep comments and copyright headers.
                    result.append(line)
                elif lst[0] == "module":
                    _, _, _, filename = lst[:4]
                    if filename in used_modules:
                        # Used module
                        result.append(line)
                elif lst[0] == "alias":
                    _, charset, _ = lst
                    charset = self._ResolveAlias(charset.rstrip("/"))
                    if used_modules.intersection(self._modules[charset]):
                        # Alias to an used module
                        result.append(line)
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        if not dryrun:
            osutils.WriteFile(self._filename, "".join(result))
Alex Deymo2bba3812014-08-13 08:49:09 -0700235
236
def MultipleStringMatch(patterns, corpus):
    """Search a list of strings in a corpus string.

    Args:
        patterns: A list of strings.
        corpus: The text where to search for the strings.

    Returns:
        A list of Booleans stating whether each pattern string was found in the
        corpus or not.
    """
    result = [False] * len(patterns)
    if not patterns:
        # Nothing to search for; also avoids searching an automaton that has
        # no words, which pyahocorasick is expected to refuse.
        return result

    # The same word may show up more than once in |patterns|. Each word is
    # stored once in the automaton, so keep every index that refers to it.
    indices_by_word = {}
    for i, word in enumerate(patterns):
        indices_by_word.setdefault(word, []).append(i)

    tree = ahocorasick.Automaton()
    for word, indices in indices_by_word.items():
        tree.add_word(word, indices)
    tree.make_automaton()

    for _, indices in tree.iter(corpus):
        for i in indices:
            result[i] = True

    return result
Alex Deymo2bba3812014-08-13 08:49:09 -0700259
260
def GconvStrip(opts):
    """Process gconv-modules and remove unused modules.

    Args:
        opts: The command-line args passed to the script. The attributes read
            here are |root|, |debug| and |dryrun|.

    Returns:
        The exit code number indicating whether the process succeeded: 0 on
        success, 1 when no gconv-modules file was found under |opts.root|.
    """
    root_st = os.lstat(opts.root)
    if not stat.S_ISDIR(root_st.st_mode):
        cros_build_lib.Die("root (%s) must be a directory.", opts.root)

    # Detect the possible locations of the gconv-modules file.
    gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

    if not gconv_modules_files:
        logging.warning("gconv-modules file not found.")
        return 1

    # Only one gconv-modules files should be present, either on /usr/lib or
    # /usr/lib64, but not both.
    if len(gconv_modules_files) > 1:
        cros_build_lib.Die("Found several gconv-modules files.")

    gconv_modules_file = gconv_modules_files[0]
    logging.info(
        "Searching for unused gconv files defined in %s", gconv_modules_file
    )

    gmods = GconvModules(gconv_modules_file)
    charsets = gmods.Load()

    # Use scanelf to search for all the binary files on the rootfs that require
    # or define the symbol iconv_open. We also include the binaries that define
    # it since there could be internal calls to it from other functions.
    symbols = ",".join(GCONV_SYMBOLS)
    cmd = [
        "scanelf",
        "--mount",
        "--quiet",
        "--recursive",
        "--format",
        "#s%F",
        "--symbol",
        symbols,
        opts.root,
    ]
    result = cros_build_lib.run(
        cmd, stdout=True, print_cmd=False, encoding="utf-8"
    )
    files = set(result.stdout.splitlines())
    logging.debug("Symbols %s found on %d files.", symbols, len(files))

    # The charsets are represented as nul-terminated strings in the binary
    # files, so we append the '\0' to each string. This prevents some false
    # positives when the name of the charset is a substring of some other
    # string. It doesn't prevent false positives when the charset name is the
    # suffix of another string, for example a binary with the string "DON'T DO
    # IT\0" will match the 'IT' charset. Empirical test on ChromeOS images
    # suggests that only 4 charsets could fall in category.
    strings = [s.encode("utf-8") + b"\x00" for s in charsets]
    logging.info(
        "Will search for %d strings in %d files", len(strings), len(files)
    )

    # Charsets listed in STICKY_MODULES are initialized as used. Note that those
    # strings should be listed in the gconv-modules file.
    unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
    if unknown_sticky_modules:
        logging.warning(
            "The following charsets were explicitly requested in "
            "STICKY_MODULES even though they don't exist: %s",
            ", ".join(sorted(unknown_sticky_modules)),
        )
    global_used = [charset in STICKY_MODULES for charset in charsets]

    for filename in files:
        used_filenames = MultipleStringMatch(
            strings, osutils.ReadFile(filename, mode="rb")
        )

        # Accumulate: a charset is used if any scanned file references it.
        global_used = [
            seen or found for seen, found in zip(global_used, used_filenames)
        ]
        # Check the debug flag to avoid running a useless loop.
        if opts.debug and any(used_filenames):
            logging.debug("File %s:", filename)
            for i, used_filename in enumerate(used_filenames):
                if used_filename:
                    logging.debug(" - %s", strings[i])

    used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
    gmods.Rewrite(used_charsets, opts.dryrun)
    return 0
Alex Deymo2bba3812014-08-13 08:49:09 -0700356
357
def ParseArgs(argv):
    """Parse |argv| and return the options object after calling Freeze()."""
    arg_parser = commandline.ArgumentParser(description=__doc__, dryrun=True)
    arg_parser.add_argument(
        "root",
        type="path",
        help="path to the directory where the rootfs is mounted.",
    )

    parsed = arg_parser.parse_args(argv)
    parsed.Freeze()
    return parsed
Alex Deymo2bba3812014-08-13 08:49:09 -0700371
372
def main(argv):
    """Script entry point: parse |argv| and run the gconv stripping.

    Returns:
        The exit code produced by GconvStrip().
    """
    options = ParseArgs(argv)
    logging.debug("Options are %s", options)

    exit_code = GconvStrip(options)
    return exit_code
Alex Klein1699fab2022-09-08 08:46:06 -0600378 return GconvStrip(opts)