# Copyright 2014 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to remove unused gconv charset modules from a build."""
6
import functools
import glob
import logging
import operator
import os
import stat

from chromite.third_party import lddtree

from chromite.lib import commandline
from chromite.lib import cros_build_lib
from chromite.lib import osutils
Mike Frysinger95452702021-01-23 00:07:22 -050019
Alex Deymo2bba3812014-08-13 08:49:09 -070020
# Bind the `ahocorasick` module used by MultipleStringMatch(). When pytest is
# importable the module is obtained through pytest.importorskip() (presumably
# so that test runs are skipped rather than failed when ahocorasick is
# missing -- TODO confirm); when pytest itself cannot be imported, fall back
# to a plain import.
try:
    import pytest  # pylint: disable=import-error

    ahocorasick = pytest.importorskip("ahocorasick")
except ImportError:
    import ahocorasick
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070027
Alex Deymo2bba3812014-08-13 08:49:09 -070028
# Path pattern (glob, relative to the rootfs directory) used to search for the
# gconv-modules file.
GCONV_MODULES_PATH = "usr/*/gconv/gconv-modules"

# Sticky modules. These charset modules are always included even if they
# aren't used. You can specify any charset name as supported by 'iconv_open',
# for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ("UTF-16", "UTF-32", "UNICODE")

# List of function names (symbols) known to use a charset as a parameter.
# Binaries referencing any of these symbols are scanned for charset names.
GCONV_SYMBOLS = (
    # glibc
    "iconv_open",
    "iconv",
    # glib
    "g_convert",
    "g_convert_with_fallback",
    "g_iconv",
    "g_locale_to_utf8",
    "g_get_charset",
)
49
50
class GconvModules:
    """Class to manipulate the gconv/gconv-modules file and referenced modules.

    This class parses the contents of the gconv-modules file installed by glibc
    which provides the definition of the charsets supported by iconv_open(3).
    It allows loading the current gconv-modules file and rewriting it to
    include only a subset of the supported modules, removing the other modules.

    Each charset is involved in some transformation between that charset and
    an internal representation. This transformation is defined in a .so file
    loaded dynamically with dlopen(3) when the charset defined in this file is
    requested through iconv_open(3).

    See the comments in the gconv-modules file for syntax details.
    """

    def __init__(self, gconv_modules_file):
        """Initialize the class.

        Args:
            gconv_modules_file: Path to gconv/gconv-modules file.
        """
        self._filename = gconv_modules_file

        # An alias map of charsets. The key (fromcharset) is the alias name
        # and the value (tocharset) is the real charset name. We also support
        # a value that is an alias for another charset.
        self._alias = {}

        # The modules dict goes from charset to module names (the filenames
        # without the .so extension). Since several transformations involving
        # the same charset could be defined in different files, the values of
        # this dict are a set of module names.
        self._modules = {}

    def _ResolveAlias(self, charset):
        """Follow the alias chain starting at |charset| to the real charset.

        Args:
            charset: A charset or alias name (without trailing slashes).

        Returns:
            The first name in the chain that is not itself an alias.
        """
        seen = set()
        while charset in self._alias:
            if charset in seen:
                # A cyclic alias definition would otherwise spin forever.
                # Die() is relied upon to abort, as it is for unknown lines.
                cros_build_lib.Die(
                    'Alias loop detected for charset "%s".', charset
                )
            seen.add(charset)
            charset = self._alias[charset]
        return charset

    @staticmethod
    def _RemoveFiles(paths, dry_run):
        """Delete |paths| and return the number of bytes they occupied.

        Args:
            paths: Iterable of file paths to delete.
            dry_run: When true, only log and measure; nothing is deleted.

        Returns:
            The sum of the sizes (st_size from lstat) of all |paths|.
        """
        total_size = 0
        for path in paths:
            total_size += os.lstat(path).st_size
            logging.debug("rm %s", path)
            if not dry_run:
                os.unlink(path)
        return total_size

    def Load(self):
        """Load the charsets from gconv-modules.

        Returns:
            A sorted list with every alias and charset name found in the file,
            excluding the 'INTERNAL' pseudo charset.
        """
        with open(self._filename) as fp:
            for line in fp:
                line = line.split("#", 1)[0].strip()
                if not line:
                    # Ignore blank lines & comments.
                    continue

                lst = line.split()
                if lst[0] == "module":
                    # Any field past the filename (e.g. the cost) is ignored.
                    _, fromset, toset, filename = lst[:4]
                    for charset in (fromset, toset):
                        charset = charset.rstrip("/")
                        self._modules.setdefault(charset, set()).add(filename)
                elif lst[0] == "alias":
                    _, fromset, toset = lst
                    fromset = fromset.rstrip("/")
                    toset = toset.rstrip("/")
                    # Warn if the same charset is defined as two different
                    # aliases. The last definition wins.
                    previous = self._alias.get(fromset, toset)
                    if previous != toset:
                        logging.error(
                            'charset "%s" already defined as "%s".',
                            fromset,
                            previous,
                        )
                    self._alias[fromset] = toset
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        logging.debug(
            "Found %d modules and %d alias in %s",
            len(self._modules),
            len(self._alias),
            self._filename,
        )
        # A name may be both an alias and a module charset; report it once.
        # 'INTERNAL' is dropped since it is not a charset but an internal
        # representation used to convert to and from other charsets.
        names = set(self._alias).union(self._modules)
        names.discard("INTERNAL")
        return sorted(names)

    def Rewrite(self, used_charsets, dry_run=False):
        """Rewrite gconv-modules file with only the used charsets.

        Unused module .so files, and the libraries in the same directory that
        only unused modules depend on, are deleted as well.

        Args:
            used_charsets: A list of used charsets. This should be a subset of
                the list returned by Load().
            dry_run: Whether this function should not change any file.
        """
        # Compute the used modules.
        used_modules = set()
        for charset in used_charsets:
            used_modules.update(self._modules[self._ResolveAlias(charset)])

        # The explicit initial value keeps this working when the file defines
        # no modules at all.
        all_modules = functools.reduce(
            set.union, self._modules.values(), set()
        )
        unused_modules = all_modules - used_modules

        modules_dir = os.path.dirname(self._filename)

        # The list of charsets that depend on a given library. For example,
        # libdeps['libCNS.so'] is the set of all the modules that require that
        # library. These libraries live in the same directory as the modules.
        libdeps = {}
        for module in all_modules:
            deps = lddtree.ParseELF(
                os.path.join(modules_dir, "%s.so" % module), modules_dir, []
            )
            if "needed" not in deps:
                continue
            for lib in deps["needed"]:
                # Ignore the libs without a path defined (outside the
                # modules_dir).
                if deps["libs"][lib]["path"]:
                    libdeps.setdefault(lib, set()).add(module)

        used_libdeps = {
            lib
            for lib, dependents in libdeps.items()
            if dependents.intersection(used_modules)
        }
        unused_libdeps = set(libdeps).difference(used_libdeps)

        logging.debug("Used modules: %s", ", ".join(sorted(used_modules)))
        logging.debug(
            "Used dependency libs: %s", ", ".join(sorted(used_libdeps))
        )

        unused_size = self._RemoveFiles(
            (
                os.path.join(modules_dir, "%s.so" % module)
                for module in sorted(unused_modules)
            ),
            dry_run,
        )
        unused_libdeps_size = self._RemoveFiles(
            (os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)),
            dry_run,
        )

        logging.info(
            "Done. Using %d gconv modules. Removed %d unused modules"
            " (%.1f KiB) and %d unused dependencies (%.1f KiB)",
            len(used_modules),
            len(unused_modules),
            unused_size / 1024.0,
            len(unused_libdeps),
            unused_libdeps_size / 1024.0,
        )

        # Recompute the gconv-modules file with only the included gconv
        # modules.
        result = []
        with open(self._filename) as fp:
            for line in fp:
                lst = line.split("#", 1)[0].strip().split()

                if not lst:
                    # Keep comments and copyright headers.
                    result.append(line)
                elif lst[0] == "module":
                    _, _, _, filename = lst[:4]
                    if filename in used_modules:
                        # Used module
                        result.append(line)
                elif lst[0] == "alias":
                    _, charset, _ = lst
                    charset = self._ResolveAlias(charset.rstrip("/"))
                    if used_modules.intersection(self._modules[charset]):
                        # Alias to a used module
                        result.append(line)
                else:
                    cros_build_lib.Die("Unknown line: %s", line)

        if not dry_run:
            osutils.WriteFile(self._filename, "".join(result))
Alex Deymo2bba3812014-08-13 08:49:09 -0700232
233
def MultipleStringMatch(patterns, corpus):
    """Report which of several patterns occur anywhere in a corpus.

    All the patterns are searched in a single pass over the corpus using an
    Aho-Corasick automaton.

    Args:
        patterns: A list of strings.
        corpus: The text where to search for the strings.

    Returns:
        A list of Booleans, parallel to |patterns|, stating whether each
        pattern string was found in the corpus or not.
    """
    automaton = ahocorasick.Automaton()
    for index, pattern in enumerate(patterns):
        # The stored value is the pattern's position so a hit maps back to it.
        automaton.add_word(pattern, index)
    automaton.make_automaton()

    matched = {index for _, index in automaton.iter(corpus)}
    return [index in matched for index in range(len(patterns))]
Alex Deymo2bba3812014-08-13 08:49:09 -0700256
257
def GconvStrip(opts):
    """Process gconv-modules and remove unused modules.

    Binaries under opts.root that reference any of GCONV_SYMBOLS are scanned
    for the NUL-terminated names of the charsets listed in gconv-modules. The
    charsets found, plus STICKY_MODULES, are kept; everything else is removed
    through GconvModules.Rewrite().

    Args:
        opts: The command-line args passed to the script. The attributes read
            are root, dry_run and debug.

    Returns:
        The exit code number indicating whether the process succeeded: 0 on
        success, 1 when no gconv-modules file exists under opts.root.
    """
    root_st = os.lstat(opts.root)
    if not stat.S_ISDIR(root_st.st_mode):
        cros_build_lib.Die("root (%s) must be a directory.", opts.root)

    # Detect the possible locations of the gconv-modules file.
    gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

    if not gconv_modules_files:
        logging.warning("gconv-modules file not found.")
        return 1

    # Only one gconv-modules files should be present, either on /usr/lib or
    # /usr/lib64, but not both.
    if len(gconv_modules_files) > 1:
        cros_build_lib.Die("Found several gconv-modules files.")

    gconv_modules_file = gconv_modules_files[0]
    logging.info(
        "Searching for unused gconv files defined in %s", gconv_modules_file
    )

    gmods = GconvModules(gconv_modules_file)
    charsets = gmods.Load()

    # Use scanelf to search for all the binary files on the rootfs that require
    # or define the symbol iconv_open. We also include the binaries that define
    # it since there could be internal calls to it from other functions.
    symbols = ",".join(GCONV_SYMBOLS)
    cmd = [
        "scanelf",
        "--mount",
        "--quiet",
        "--recursive",
        "--format",
        "#s%F",
        "--symbol",
        symbols,
        opts.root,
    ]
    result = cros_build_lib.run(
        cmd, stdout=True, print_cmd=False, encoding="utf-8"
    )
    files = set(result.stdout.splitlines())
    logging.debug("Symbols %s found on %d files.", symbols, len(files))

    # The charsets are represented as nul-terminated strings in the binary
    # files, so we append the '\0' to each string. This prevents some false
    # positives when the name of the charset is a substring of some other
    # string. It doesn't prevent false positives when the charset name is the
    # suffix of another string, for example a binary with the string
    # "DON'T DO IT\0" will match the 'IT' charset. Empirical test on ChromeOS
    # images suggests that only 4 charsets could fall in this category.
    strings = [s.encode("utf-8") + b"\0" for s in charsets]
    logging.info(
        "Will search for %d strings in %d files", len(strings), len(files)
    )

    # Charsets listed in STICKY_MODULES are initialized as used. Note that
    # those strings should be listed in the gconv-modules file.
    unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
    if unknown_sticky_modules:
        logging.warning(
            "The following charsets were explicitly requested in STICKY_MODULES "
            "even though they don't exist: %s",
            ", ".join(sorted(unknown_sticky_modules)),
        )
    global_used = [charset in STICKY_MODULES for charset in charsets]

    for filename in files:
        used_filenames = MultipleStringMatch(
            strings, osutils.ReadFile(filename, mode="rb")
        )

        # Accumulate, per charset, whether any file seen so far uses it.
        global_used = list(map(operator.or_, global_used, used_filenames))
        # Check the debug flag to avoid running an useless loop.
        if opts.debug and any(used_filenames):
            logging.debug("File %s:", filename)
            for charset, used_filename in zip(charsets, used_filenames):
                if used_filename:
                    logging.debug(" - %s", charset)

    used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
    gmods.Rewrite(used_charsets, opts.dry_run)
    return 0
Alex Deymo2bba3812014-08-13 08:49:09 -0700353
354
def ParseArgs(argv):
    """Parse |argv| and return the resulting frozen options object."""
    # Each entry is (positional names/flags, keyword settings) for one
    # argument, registered in this order.
    argument_specs = (
        (
            ("--dry-run",),
            {
                "action": "store_true",
                "default": False,
                "help": "process but don't modify any file.",
            },
        ),
        (
            ("root",),
            {
                "type": "path",
                "help": "path to the directory where the rootfs is mounted.",
            },
        ),
    )

    parser = commandline.ArgumentParser()
    for names, settings in argument_specs:
        parser.add_argument(*names, **settings)

    options = parser.parse_args(argv)
    options.Freeze()
    return options
Alex Deymo2bba3812014-08-13 08:49:09 -0700374
375
def main(argv):
    """Script entry point: parse |argv|, strip gconv, return the exit code."""
    options = ParseArgs(argv)
    logging.debug("Options are %s", options)

    exit_code = GconvStrip(options)
    return exit_code
Alex Klein1699fab2022-09-08 08:46:06 -0600381 return GconvStrip(opts)