blob: 26bd864aabbf02be6db3547400059cef15783f22 [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Mike Frysinger9997cc02019-07-17 15:50:01 -040010import functools
Alex Deymo2bba3812014-08-13 08:49:09 -070011import glob
12import operator
13import os
14import stat
Mike Frysinger8e303f02020-02-14 22:53:11 -050015import sys
Alex Deymo2bba3812014-08-13 08:49:09 -070016
Mike Frysingercb56b642019-08-25 15:33:08 -040017import ahocorasick # pylint: disable=import-error
Mike Frysinger6db648e2018-07-24 19:57:58 -040018import lddtree
19
Alex Deymo2bba3812014-08-13 08:49:09 -070020from chromite.lib import commandline
21from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070022from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070023from chromite.lib import osutils
24
25
Mike Frysinger8e303f02020-02-14 22:53:11 -050026assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'
27
28
# Glob pattern used to locate the gconv-modules file. It is joined onto the
# rootfs directory given on the command line before being expanded.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules. These charset modules are always kept, even if no binary
# appears to use them. You can specify any charset name supported by
# 'iconv_open', for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# List of function names (symbols) known to take a charset as a parameter.
# Binaries referencing any of these symbols are scanned for charset names.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
49
50
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def Load(self):
    """Load the charsets from gconv-modules.

    Populates the alias and module maps from the 'alias' and 'module' lines
    of the file. Any other non-blank, non-comment line is reported through
    cros_build_lib.Die().

    Returns:
      A sorted list with every alias name and every charset name found in the
      file, with the 'INTERNAL' pseudo charset removed.
    """
    with open(self._filename) as fp:
      for line in fp:
        line = line.split('#', 1)[0].strip()
        if not line:
          # Ignore blank lines & comments.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Only the first four fields are needed; any trailing field is
          # ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            self._modules.setdefault(charset.rstrip('/'), set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          # The most recent definition wins.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Deletes the module files (and the helper libraries only they depend on)
    that are not needed by any of the used charsets, then rewrites the
    gconv-modules file keeping only the lines that refer to kept modules.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
          list returned by Load().
      dry_run: Whether this function should not change any file.
    """

    # Compute the used modules, following alias chains to the real charset.
    used_modules = set()
    for charset in used_charsets:
      while charset in self._alias:
        charset = self._alias[charset]
      used_modules.update(self._modules[charset])
    # set().union(*...) also works when no module is defined at all, unlike
    # a reduce() without an initial value.
    unused_modules = set().union(*self._modules.values()) - used_modules

    modules_dir = os.path.dirname(self._filename)

    all_modules = used_modules | unused_modules
    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    # A library is kept as soon as one used module depends on it.
    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as fp:
      for line in fp:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          # Keep comments and copyright headers.
          result.append(line)
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            # Used module
            result.append(line)
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = charset.rstrip('/')
          while charset in self._alias:
            charset = self._alias[charset]
          if used_modules.intersection(self._modules[charset]):
            # Alias to a used module
            result.append(line)
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
214
215
def MultipleStringMatch(patterns, corpus):
  """Report which of several strings occur in a corpus.

  All the patterns are loaded into a single Aho-Corasick automaton so the
  corpus is scanned only once regardless of the number of patterns.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans, parallel to |patterns|, stating whether each pattern
    string was found in the corpus or not.
  """
  found = [False] * len(patterns)

  # Each word carries its position in |patterns| as the payload, so a match
  # can be mapped straight back to its slot in the result list.
  automaton = ahocorasick.Automaton()
  for index, pattern in enumerate(patterns):
    automaton.add_word(pattern, index)
  automaton.make_automaton()

  for _end_pos, index in automaton.iter(corpus):
    found[index] = True

  return found
238
239
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Locates the gconv-modules file under the rootfs, finds the binaries that
  reference any of the GCONV_SYMBOLS, searches those binaries for the charset
  names defined in gconv-modules and finally strips every module whose
  charsets were not found (except the ones listed in STICKY_MODULES).

  Args:
    opts: The command-line args passed to the script. The attributes read
        here are |root|, |dry_run| and |debug|.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file was found.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules file should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                              encoding='utf-8')
  files = set(result.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s.encode('utf-8') + b'\x00' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(unknown_sticky_modules))
  # |global_used| is parallel to |charsets|: True once a charset is needed.
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filenames = MultipleStringMatch(strings,
                                         osutils.ReadFile(filename, mode='rb'))

    # Accumulate the per-file matches into the global result.
    global_used = [operator.or_(*x) for x in zip(global_used, used_filenames)]
    # Check the debug flag to avoid running a useless loop.
    if opts.debug and any(used_filenames):
      logging.debug('File %s:', filename)
      for i, used_filename in enumerate(used_filenames):
        if used_filename:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
319
320
def ParseArgs(argv):
  """Parse the command line and return the frozen options object.

  Args:
    argv: The list of command-line arguments to parse.

  Returns:
    The parsed options, after Freeze() has been called on them.
  """
  arg_parser = commandline.ArgumentParser()
  arg_parser.add_argument('--dry-run',
                          action='store_true',
                          default=False,
                          help="process but don't modify any file.")
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  parsed = arg_parser.parse_args(argv)
  parsed.Freeze()
  return parsed
335
336
def main(argv):
  """Script entry point: parse |argv| and run the gconv stripping.

  Args:
    argv: The list of command-line arguments.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)
  exit_code = GconvStrip(options)
  return exit_code
342 return GconvStrip(opts)