blob: f76493bcc2718b5a61533d31793004ca77ffefd1 [file] [log] [blame]
Alex Deymo2bba3812014-08-13 08:49:09 -07001# Copyright 2014 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Script to remove unused gconv charset modules from a build."""
6
Mike Frysinger9997cc02019-07-17 15:50:01 -04007import functools
Alex Deymo2bba3812014-08-13 08:49:09 -07008import glob
Chris McDonald59650c32021-07-20 15:29:28 -06009import logging
Alex Deymo2bba3812014-08-13 08:49:09 -070010import operator
11import os
12import stat
13
Chris McDonald59650c32021-07-20 15:29:28 -060014from chromite.third_party import lddtree
15
Alex Deymo2bba3812014-08-13 08:49:09 -070016from chromite.lib import commandline
17from chromite.lib import cros_build_lib
18from chromite.lib import osutils
Mike Frysinger95452702021-01-23 00:07:22 -050019
Alex Deymo2bba3812014-08-13 08:49:09 -070020
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070021try:
22 import pytest # pylint: disable=import-error
23 ahocorasick = pytest.importorskip('ahocorasick')
24except ImportError:
25 import ahocorasick
26
Alex Deymo2bba3812014-08-13 08:49:09 -070027
# Glob pattern, relative to the rootfs directory, that locates the
# gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: charsets that are always kept, whether or not any binary
# appears to use them. Any charset name accepted by 'iconv_open' is valid
# here, e.g. 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# Function names (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    'iconv_open',
    'iconv',
    # Provided by glib.
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
48
49
class GconvModules(object):
    """Class to manipulate the gconv/gconv-modules file and referenced modules.

    This class parses the contents of the gconv-modules file installed by
    glibc, which defines the charsets supported by iconv_open(3). It can load
    the current gconv-modules file and rewrite it so that it only lists a
    subset of the supported modules, removing the other modules from disk.

    Each charset is involved in some transformation between that charset and
    an internal representation. The transformation lives in a .so file that is
    loaded dynamically with dlopen(3) when the charset is requested through
    iconv_open(3).

    See the comments in the gconv-modules file for syntax details.
    """

    def __init__(self, gconv_modules_file):
        """Initialize the class.

        Args:
            gconv_modules_file: Path to gconv/gconv-modules file.
        """
        self._filename = gconv_modules_file

        # An alias map of charsets. The key (fromcharset) is the alias name
        # and the value (tocharset) is the real charset name. A value may
        # itself be an alias for another charset.
        self._alias = {}

        # Maps a charset to the set of module names (filenames without the
        # .so extension) that implement a transformation involving it. Several
        # transformations for one charset may be defined in different files.
        self._modules = {}

    def _ResolveAlias(self, charset):
        """Follow the alias chain starting at |charset| to its final target.

        The walk stops as soon as a name repeats, so a gconv-modules file
        whose aliases form a cycle cannot make the script loop forever.

        Args:
            charset: A charset or alias name (without trailing slashes).

        Returns:
            The name reached at the end of the alias chain.
        """
        seen = set()
        while charset in self._alias and charset not in seen:
            seen.add(charset)
            charset = self._alias[charset]
        return charset

    @staticmethod
    def _RemoveFiles(paths, dry_run):
        """Unlink every path in |paths| and return their combined size.

        Args:
            paths: Iterable of file paths, processed in the given order.
            dry_run: When true, only log and measure; do not unlink.

        Returns:
            The sum of the st_size of all the paths, in bytes.
        """
        total_size = 0
        for path in paths:
            total_size += os.lstat(path).st_size
            logging.debug('rm %s', path)
            if not dry_run:
                os.unlink(path)
        return total_size

    def Load(self):
        """Load the charsets from gconv-modules.

        Returns:
            The sorted list of alias and charset names found in the file,
            without the 'INTERNAL' pseudo charset.
        """
        with open(self._filename, encoding='utf-8') as fp:
            for line in fp:
                line = line.split('#', 1)[0].strip()
                if not line:
                    # Ignore blank lines & comments.
                    continue

                lst = line.split()
                if lst[0] == 'module':
                    # Anything after the fourth field is not needed here.
                    _, fromset, toset, filename = lst[:4]
                    for charset in (fromset, toset):
                        charset = charset.rstrip('/')
                        self._modules.setdefault(charset, set()).add(filename)
                elif lst[0] == 'alias':
                    _, fromset, toset = lst
                    fromset = fromset.rstrip('/')
                    toset = toset.rstrip('/')
                    # Warn if the same charset is defined as two different
                    # aliases; the last definition wins.
                    if self._alias.get(fromset, toset) != toset:
                        logging.error('charset "%s" already defined as "%s".',
                                      fromset, self._alias[fromset])
                    self._alias[fromset] = toset
                else:
                    cros_build_lib.Die('Unknown line: %s', line)

        logging.debug('Found %d modules and %d alias in %s',
                      len(self._modules), len(self._alias), self._filename)
        charsets = sorted(list(self._alias) + list(self._modules))
        # Remove the 'INTERNAL' charset from the list, since it is not a
        # charset but an internal representation used to convert to and from
        # other charsets.
        if 'INTERNAL' in charsets:
            charsets.remove('INTERNAL')
        return charsets

    def Rewrite(self, used_charsets, dry_run=False):
        """Rewrite gconv-modules file with only the used charsets.

        Deletes the .so files of the modules that no used charset needs,
        deletes the helper libraries in the modules directory that only those
        modules depended on, and rewrites the gconv-modules file so it only
        lists the surviving modules and the aliases that lead to them.

        Args:
            used_charsets: A list of used charsets. This should be a subset of
                the list returned by Load().
            dry_run: Whether this function should not change any file.
        """
        # Compute the used modules. A charset whose alias chain ends on a name
        # with no module contributes nothing instead of raising KeyError.
        used_modules = set()
        for charset in used_charsets:
            target = self._ResolveAlias(charset)
            used_modules.update(self._modules.get(target, ()))

        # set().union(*...) also works when no module is defined at all.
        all_modules = set().union(*self._modules.values())
        unused_modules = all_modules - used_modules

        modules_dir = os.path.dirname(self._filename)

        # The set of modules that depend on a given library. For example,
        # libdeps['libCNS.so'] is the set of all the modules that require that
        # library. These libraries live in the same directory as the modules.
        libdeps = {}
        for module in all_modules:
            deps = lddtree.ParseELF(
                os.path.join(modules_dir, '%s.so' % module), modules_dir, [])
            if 'needed' not in deps:
                continue
            for lib in deps['needed']:
                # Ignore the libs without a path defined (outside the
                # modules_dir).
                if deps['libs'][lib]['path']:
                    libdeps.setdefault(lib, set()).add(module)

        used_libdeps = {lib for lib, users in libdeps.items()
                        if users & used_modules}
        unused_libdeps = set(libdeps) - used_libdeps

        logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
        logging.debug('Used dependency libs: %s',
                      ', '.join(sorted(used_libdeps)))

        unused_size = self._RemoveFiles(
            [os.path.join(modules_dir, '%s.so' % module)
             for module in sorted(unused_modules)],
            dry_run)
        unused_libdeps_size = self._RemoveFiles(
            [os.path.join(modules_dir, lib)
             for lib in sorted(unused_libdeps)],
            dry_run)

        logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                     ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                     len(used_modules), len(unused_modules),
                     unused_size / 1024.,
                     len(unused_libdeps), unused_libdeps_size / 1024.)

        # Recompute the gconv-modules file with only the included gconv
        # modules.
        result = []
        with open(self._filename, encoding='utf-8') as fp:
            for line in fp:
                lst = line.split('#', 1)[0].strip().split()

                if not lst:
                    # Keep comments and copyright headers.
                    result.append(line)
                elif lst[0] == 'module':
                    _, _, _, filename = lst[:4]
                    if filename in used_modules:
                        # Used module.
                        result.append(line)
                elif lst[0] == 'alias':
                    _, charset, _ = lst
                    target = self._ResolveAlias(charset.rstrip('/'))
                    if used_modules.intersection(
                            self._modules.get(target, ())):
                        # Alias to a used module.
                        result.append(line)
                else:
                    cros_build_lib.Die('Unknown line: %s', line)

        if not dry_run:
            osutils.WriteFile(self._filename, ''.join(result))
213
214
def MultipleStringMatch(patterns, corpus):
    """Search a list of strings in a corpus string.

    All the patterns are searched for in a single pass over the corpus using
    an Aho-Corasick automaton.

    Args:
        patterns: A list of strings.
        corpus: The text where to search for the strings.

    Returns:
        A list of Booleans stating whether each pattern string was found in
        the corpus or not. The list is empty when |patterns| is empty.
    """
    if not patterns:
        # An automaton with no words cannot be iterated, and there is
        # nothing to look for anyway.
        return []

    found = [False] * len(patterns)

    # Group the positions by pattern: adding the same word twice replaces the
    # stored value, which would leave the earlier duplicates unreported.
    positions = {}
    for index, word in enumerate(patterns):
        positions.setdefault(word, []).append(index)

    automaton = ahocorasick.Automaton()
    for word, indices in positions.items():
        automaton.add_word(word, indices)
    automaton.make_automaton()

    for _, indices in automaton.iter(corpus):
        for index in indices:
            found[index] = True

    return found
237
238
def GconvStrip(opts):
    """Process gconv-modules and remove unused modules.

    Finds the single gconv-modules file under opts.root, asks scanelf for the
    binaries that reference any of the GCONV_SYMBOLS, searches those binaries
    for the NUL-terminated name of every known charset, and finally rewrites
    gconv-modules keeping only the charsets that were found plus the ones in
    STICKY_MODULES.

    Args:
        opts: The command-line args passed to the script. Reads opts.root,
            opts.dry_run and opts.debug.

    Returns:
        The exit code number indicating whether the process succeeded: 0 on
        success, 1 when no gconv-modules file was found.
    """
    root_st = os.lstat(opts.root)
    if not stat.S_ISDIR(root_st.st_mode):
        cros_build_lib.Die('root (%s) must be a directory.', opts.root)

    # Detect the possible locations of the gconv-modules file.
    gconv_modules_files = glob.glob(
        os.path.join(opts.root, GCONV_MODULES_PATH))

    if not gconv_modules_files:
        logging.warning('gconv-modules file not found.')
        return 1

    # Only one gconv-modules files should be present, either on /usr/lib or
    # /usr/lib64, but not both.
    if len(gconv_modules_files) > 1:
        cros_build_lib.Die('Found several gconv-modules files.')

    gconv_modules_file = gconv_modules_files[0]
    logging.info('Searching for unused gconv files defined in %s',
                 gconv_modules_file)

    gmods = GconvModules(gconv_modules_file)
    charsets = gmods.Load()

    # Use scanelf to search for all the binary files on the rootfs that
    # require or define the symbol iconv_open. We also include the binaries
    # that define it since there could be internal calls to it from other
    # functions.
    symbols = ','.join(GCONV_SYMBOLS)
    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
           '--symbol', symbols, opts.root]
    result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                                encoding='utf-8')
    files = set(result.stdout.splitlines())
    logging.debug('Symbols %s found on %d files.', symbols, len(files))

    # The charsets are represented as nul-terminated strings in the binary
    # files, so we append the '\0' to each string. This prevents some false
    # positives when the name of the charset is a substring of some other
    # string. It doesn't prevent false positives when the charset name is the
    # suffix of another string, for example a binary with the string
    # "DON'T DO IT\0" will match the 'IT' charset. Empirical test on ChromeOS
    # images suggests that only 4 charsets could fall in category.
    strings = [s.encode('utf-8') + b'\x00' for s in charsets]
    logging.info('Will search for %d strings in %d files', len(strings),
                 len(files))

    # Charsets listed in STICKY_MODULES are initialized as used. Note that
    # those strings should be listed in the gconv-modules file.
    sticky = set(STICKY_MODULES)
    unknown_sticky_modules = sticky - set(charsets)
    if unknown_sticky_modules:
        logging.warning(
            'The following charsets were explicitly requested in '
            "STICKY_MODULES even though they don't exist: %s",
            ', '.join(sorted(unknown_sticky_modules)))
    global_used = [charset in sticky for charset in charsets]

    for filename in files:
        # hits[i] tells whether charsets[i] appears in this file.
        hits = MultipleStringMatch(strings,
                                   osutils.ReadFile(filename, mode='rb'))

        global_used = [used or hit for used, hit in zip(global_used, hits)]
        # Check the debug flag to avoid running an useless loop.
        if opts.debug and any(hits):
            logging.debug('File %s:', filename)
            for charset, hit in zip(charsets, hits):
                if hit:
                    logging.debug(' - %s', charset)

    used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
    gmods.Rewrite(used_charsets, opts.dry_run)
    return 0
318
319
def ParseArgs(argv):
    """Parse |argv| and return the frozen options object.

    Args:
        argv: The list of command-line arguments to parse.

    Returns:
        The parsed options, after Freeze() has been called on them.
    """
    arg_parser = commandline.ArgumentParser()

    # Optional flag: go through the motions without touching the rootfs.
    arg_parser.add_argument(
        '--dry-run',
        action='store_true',
        default=False,
        help="process but don't modify any file.",
    )

    # Positional argument: where the rootfs to be stripped is mounted.
    arg_parser.add_argument(
        'root',
        type='path',
        help='path to the directory where the rootfs is mounted.',
    )

    parsed = arg_parser.parse_args(argv)
    parsed.Freeze()
    return parsed
334
335
def main(argv):
    """Script entry point: parse |argv| and strip the unused gconv modules.

    Args:
        argv: The list of command-line arguments.

    Returns:
        The exit code produced by GconvStrip().
    """
    options = ParseArgs(argv)
    logging.debug('Options are %s', options)

    exit_code = GconvStrip(options)
    return exit_code