blob: d2486a0bc6dc35d611f91b62a6fdf7c9589fd3ee [file] [log] [blame]
Alex Deymo2bba3812014-08-13 08:49:09 -07001# Copyright 2014 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Script to remove unused gconv charset modules from a build."""
6
Mike Frysinger9997cc02019-07-17 15:50:01 -04007import functools
Alex Deymo2bba3812014-08-13 08:49:09 -07008import glob
9import operator
10import os
11import stat
12
13from chromite.lib import commandline
14from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070015from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070016from chromite.lib import osutils
Mike Frysinger95452702021-01-23 00:07:22 -050017from chromite.third_party import lddtree
18
Alex Deymo2bba3812014-08-13 08:49:09 -070019
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070020try:
21 import pytest # pylint: disable=import-error
22 ahocorasick = pytest.importorskip('ahocorasick')
23except ImportError:
24 import ahocorasick
25
Alex Deymo2bba3812014-08-13 08:49:09 -070026
# Glob pattern, relative to the rootfs, used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: charsets whose modules are kept even when no binary appears
# to use them. Any charset name accepted by 'iconv_open' can be listed here,
# for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# Names of the functions (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    'iconv_open',
    'iconv',
    # Provided by glib.
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
47
48
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain of |charset| and return the final charset name."""
    while charset in self._alias:
      charset = self._alias[charset]
    return charset

  def Load(self):
    """Load the charsets from gconv-modules.

    Only 'module' and 'alias' lines are understood; any other non-blank,
    non-comment line is reported through cros_build_lib.Die().

    Returns:
      The sorted list of all the charset names (aliases and real charsets)
      found in the file, without duplicates and without 'INTERNAL'.
    """
    with open(self._filename) as fp:
      for line in fp:
        line = line.split('#', 1)[0].strip()
        if not line:
          # Ignore blank lines & comments.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # An optional cost field may follow the filename; it is ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            self._modules.setdefault(charset.rstrip('/'), set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    # A name could be both an alias and a charset with its own module; use a
    # set so each name is reported only once.
    charsets = sorted(set(self._alias) | set(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The modules that are not needed by any used charset are deleted from the
    directory of the gconv-modules file, together with the libraries in that
    directory that only unused modules depend on.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
          list returned by Load().
      dry_run: When True, only log what would be removed; no file is deleted
          or rewritten.

    Raises:
      KeyError: If a used charset (after resolving its aliases) has no module
          defined in the gconv-modules file.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])
    # set().union(*...) also works when the file defines no module at all,
    # where reduce() without an initial value would raise TypeError.
    unused_modules = set().union(*self._modules.values()) - used_modules

    modules_dir = os.path.dirname(self._filename)

    all_modules = used_modules | unused_modules
    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as fp:
      for line in fp:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          # Keep comments and copyright headers.
          result.append(line)
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            # Used module
            result.append(line)
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            # Alias to a used module
            result.append(line)
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
212
213
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched in a single pass over the corpus using an
  Aho-Corasick automaton.

  Args:
    patterns: A list of strings. The same string may appear more than once.
    corpus: The text where to search for the strings. It must be of the same
        type (text or bytes) as the patterns.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. An empty pattern is never reported as found.
  """
  result = [False] * len(patterns)

  # Group the indexes by pattern. Adding the same word twice to the automaton
  # replaces the value stored for it, which would leave every duplicate but
  # the last one reported as not found.
  indexes_by_word = {}
  for i, word in enumerate(patterns):
    if word:
      indexes_by_word.setdefault(word, []).append(i)

  if not indexes_by_word:
    # Nothing to search for; an automaton without words cannot be searched.
    return result

  tree = ahocorasick.Automaton()
  for word, indexes in indexes_by_word.items():
    tree.add_word(word, indexes)
  tree.make_automaton()

  for _, indexes in tree.iter(corpus):
    for i in indexes:
      result[i] = True

  return result
236
237
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  The binaries under opts.root that reference any of GCONV_SYMBOLS are scanned
  for the names of the charsets defined in the gconv-modules file. The charsets
  that are not found in any of them (and are not in STICKY_MODULES) are
  considered unused and their modules are removed.

  Args:
    opts: The command-line args passed to the script. The attributes used are
        root, dry_run and debug.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file was found.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                              encoding='utf-8')
  files = set(result.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in category.
  strings = [s.encode('utf-8') + b'\x00' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    # used_in_file[i] tells whether strings[i] appears in this file.
    used_in_file = MultipleStringMatch(strings,
                                       osutils.ReadFile(filename, mode='rb'))

    global_used = list(map(operator.or_, global_used, used_in_file))
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_in_file):
      logging.debug('File %s:', filename)
      for i, used in enumerate(used_in_file):
        if used:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
317
318
def ParseArgs(argv):
  """Parse the command line in |argv| and return the frozen options."""
  arg_parser = commandline.ArgumentParser()

  # Optional flag: report what would be done without touching the rootfs.
  arg_parser.add_argument(
      '--dry-run',
      action='store_true',
      default=False,
      help="process but don't modify any file.")

  # Positional argument: the rootfs to process.
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  options = arg_parser.parse_args(argv)
  options.Freeze()
  return options
333
334
def main(argv):
  """Script entry point: parse |argv|, then strip the unused gconv modules.

  Returns:
    The exit code returned by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code
Alex Deymo2bba3812014-08-13 08:49:09 -0700339
340 return GconvStrip(opts)