Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 1 | # Copyright 2014 The Chromium OS Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | """Script to remove unused gconv charset modules from a build.""" |
| 6 | |
Mike Frysinger | 9997cc0 | 2019-07-17 15:50:01 -0400 | [diff] [blame] | 7 | import functools |
Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 8 | import glob |
Chris McDonald | 59650c3 | 2021-07-20 15:29:28 -0600 | [diff] [blame] | 9 | import logging |
Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 10 | import operator |
| 11 | import os |
| 12 | import stat |
| 13 | |
Chris McDonald | 59650c3 | 2021-07-20 15:29:28 -0600 | [diff] [blame] | 14 | from chromite.third_party import lddtree |
| 15 | |
Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 16 | from chromite.lib import commandline |
| 17 | from chromite.lib import cros_build_lib |
| 18 | from chromite.lib import osutils |
Mike Frysinger | 9545270 | 2021-01-23 00:07:22 -0500 | [diff] [blame] | 19 | |
Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 20 | |
Greg Edelston | a4c9b3b | 2020-01-07 17:51:13 -0700 | [diff] [blame] | 21 | try: |
| 22 | import pytest # pylint: disable=import-error |
| 23 | ahocorasick = pytest.importorskip('ahocorasick') |
| 24 | except ImportError: |
| 25 | import ahocorasick |
| 26 | |
Alex Deymo | 2bba381 | 2014-08-13 08:49:09 -0700 | [diff] [blame] | 27 | |
# Glob pattern, relative to the rootfs, used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules. These charset modules are always kept, even when no binary
# appears to use them. Any charset name accepted by 'iconv_open' can be listed
# here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    'UTF-16',
    'UTF-32',
    'UNICODE',
)

# Function names (symbols) known to take a charset as a parameter. Binaries
# that reference any of these symbols are scanned for charset names.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
| 48 | |
| 49 | |
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def Load(self):
    """Load the charsets from gconv-modules.

    Parses every 'module' and 'alias' line of the file into self._modules and
    self._alias. Any other non-blank, non-comment line aborts via
    cros_build_lib.Die().

    Returns:
      A sorted list with every alias name and every charset name found in the
      file, excluding the 'INTERNAL' pseudo-charset.
    """
    with open(self._filename) as fp:
      for line in fp:
        line = line.split('#', 1)[0].strip()
        if not line:
          # Ignore blank lines & comments.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Only the first four fields matter here; anything after the
          # filename is ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          # The last definition wins.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Besides rewriting the gconv-modules file, this deletes the .so file of
    every module that no used charset needs, plus the helper libraries in the
    modules directory that only unused modules depend on.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
          list returned by Load().
      dry_run: Whether this function should not change any file.
    """

    # Compute the used modules, following alias chains down to the charset
    # that actually has modules attached.
    used_modules = set()
    for charset in used_charsets:
      while charset in self._alias:
        charset = self._alias[charset]
      used_modules.update(self._modules[charset])

    # set().union(*...) also works when no module is defined at all, where a
    # reduce() without an initial value would raise TypeError.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    # A library stays if at least one used module needs it.
    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as fp:
      for line in fp:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          # Keep comments and copyright headers.
          result.append(line)
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            # Used module
            result.append(line)
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = charset.rstrip('/')
          while charset in self._alias:
            charset = self._alias[charset]
          if used_modules.intersection(self._modules[charset]):
            # Alias to an used module
            result.append(line)
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
| 213 | |
| 214 | |
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched in a single pass over the corpus using an
  Aho-Corasick automaton.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not.
  """
  if not patterns:
    # Nothing to look for. An automaton without any word can't be searched
    # (pyahocorasick rejects iterating it), so answer directly.
    return []

  result = [False] * len(patterns)

  # Each word is stored with its index in |patterns| as the value, so a match
  # can be mapped straight back to its slot in |result|.
  tree = ahocorasick.Automaton()
  for i, word in enumerate(patterns):
    tree.add_word(word, i)
  tree.make_automaton()

  for _, i in tree.iter(corpus):
    result[i] = True

  return result
| 237 | |
| 238 | |
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read
        here are |root|, |debug| and |dry_run|.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file was found under the root.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.', opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                              encoding='utf-8')
  files = set(result.stdout.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in category.
  strings = [s.encode('utf-8') + b'\x00' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filenames = MultipleStringMatch(strings,
                                         osutils.ReadFile(filename, mode='rb'))

    # A charset is used as soon as any scanned file mentions it.
    global_used = [seen or found
                   for seen, found in zip(global_used, used_filenames)]
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_filenames):
      logging.debug('File %s:', filename)
      for i, used_filename in enumerate(used_filenames):
        if used_filename:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
| 318 | |
| 319 | |
def ParseArgs(argv):
  """Parse the command line and return the frozen options.

  Args:
    argv: The list of command-line argument strings to parse.

  Returns:
    The options object produced by the parser, after Freeze() was called on it.
  """
  parser = commandline.ArgumentParser()

  # Optional flag: go through all the steps without touching the rootfs.
  parser.add_argument('--dry-run', action='store_true', default=False,
                      help="process but don't modify any file.")
  # Positional: where the rootfs to be stripped lives.
  parser.add_argument('root', type='path',
                      help='path to the directory where the rootfs is mounted.')

  parsed = parser.parse_args(argv)
  parsed.Freeze()
  return parsed
| 334 | |
| 335 | |
def main(argv):
  """Script entry point: parse the arguments and strip the gconv modules.

  Args:
    argv: The list of command-line argument strings.

  Returns:
    The exit code reported by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code