blob: f3a3aac5d4bfdd9d985ac28658b354bd47df20c1 [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Mike Frysinger9997cc02019-07-17 15:50:01 -040010import functools
Alex Deymo2bba3812014-08-13 08:49:09 -070011import glob
12import operator
13import os
14import stat
Mike Frysinger8e303f02020-02-14 22:53:11 -050015import sys
Alex Deymo2bba3812014-08-13 08:49:09 -070016
Mike Frysinger6db648e2018-07-24 19:57:58 -040017import lddtree
18
Alex Deymo2bba3812014-08-13 08:49:09 -070019from chromite.lib import commandline
20from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070021from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070022from chromite.lib import osutils
23
Greg Edelstona4c9b3b2020-01-07 17:51:13 -070024try:
25 import pytest # pylint: disable=import-error
26 ahocorasick = pytest.importorskip('ahocorasick')
27except ImportError:
28 import ahocorasick
29
Alex Deymo2bba3812014-08-13 08:49:09 -070030
assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'


# Glob, relative to the rootfs, that locates the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Charsets whose modules are always kept, whether or not any binary appears to
# use them. Any name accepted by 'iconv_open' may be listed here, for example
# 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    'UTF-16',
    'UTF-32',
    'UNICODE',
)

# Symbols (function names) known to take a charset name as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    'iconv_open',
    'iconv',
    # Provided by glib.
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
54
55
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain of |charset| and return the final charset name."""
    while charset in self._alias:
      charset = self._alias[charset]
    return charset

  @staticmethod
  def _RemoveFiles(paths, dry_run):
    """Log and (unless |dry_run|) unlink each path; return their total size."""
    total_size = 0
    for path in paths:
      total_size += os.lstat(path).st_size
      logging.debug('rm %s', path)
      if not dry_run:
        os.unlink(path)
    return total_size

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      The sorted list of alias and charset names found in the file, with one
      'INTERNAL' entry removed since it is not a real charset.
    """
    with open(self._filename) as fp:
      for line in fp:
        line = line.split('#', 1)[0].strip()
        if not line:
          # Ignore blank lines & comments.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Any fields after the filename are ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            self._modules.setdefault(charset.rstrip('/'), set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The module files and dependency libraries that no used charset needs are
    deleted from the directory holding the gconv-modules file, and the
    gconv-modules file is rewritten without the lines that refer to them.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
          list returned by Load().
      dry_run: Whether this function should not change any file.

    Raises:
      KeyError: If a used charset (after alias resolution) has no module.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])
    # Starting from an empty set keeps this working when the file defines no
    # module at all.
    unused_modules = set().union(*self._modules.values()) - used_modules

    modules_dir = os.path.dirname(self._filename)

    all_modules = used_modules | unused_modules
    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    # The format string and the joined list must be separate arguments;
    # otherwise the message itself is used as the join separator.
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = self._RemoveFiles(
        [os.path.join(modules_dir, '%s.so' % module)
         for module in sorted(unused_modules)],
        dry_run)

    unused_libdeps_size = self._RemoveFiles(
        [os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)],
        dry_run)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as fp:
      for line in fp:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          # Keep comments and copyright headers.
          result.append(line)
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            # Used module
            result.append(line)
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            # Alias to a used module
            result.append(line)
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
219
220
def MultipleStringMatch(patterns, corpus):
  """Report which of several strings occur in a corpus.

  All the patterns are loaded into a single Aho-Corasick automaton so the
  corpus is walked once rather than once per pattern.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans, parallel to |patterns|, stating whether each pattern
    string was found in the corpus or not.
  """
  automaton = ahocorasick.Automaton()
  for index, pattern in enumerate(patterns):
    # The value stored with each word is its position in |patterns|.
    automaton.add_word(pattern, index)
  automaton.make_automaton()

  # Collect the positions of every pattern reported by the automaton.
  matched = {index for _, index in automaton.iter(corpus)}

  return [index in matched for index in range(len(patterns))]
243
244
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Locates the gconv-modules file under opts.root, scans the ELF files that
  reference any of GCONV_SYMBOLS for charset names, and rewrites the
  gconv-modules file keeping only the charsets that were found (plus the ones
  in STICKY_MODULES).

  Args:
    opts: The command-line args passed to the script. Reads opts.root,
        opts.debug and opts.dry_run.

  Returns:
    The exit code number indicating whether the process succeeded: 1 when no
    gconv-modules file is found, 0 otherwise.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules file should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                              encoding='utf-8')
  files = set(result.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  # Only the NUL byte is appended: any extra character in the suffix would
  # stop real charset names from ever matching.
  strings = [s.encode('utf-8') + b'\0' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filenames = MultipleStringMatch(strings,
                                         osutils.ReadFile(filename, mode='rb'))

    # Accumulate, per charset, whether any file scanned so far references it.
    global_used = [seen or hit
                   for seen, hit in zip(global_used, used_filenames)]
    # Check the debug flag to avoid running a useless loop.
    if opts.debug and any(used_filenames):
      logging.debug('File %s:', filename)
      for i, used_filename in enumerate(used_filenames):
        if used_filename:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
324
325
def ParseArgs(argv):
  """Parse |argv| and return the frozen options object."""
  arg_parser = commandline.ArgumentParser()

  # Optional switch: go through the motions without touching the rootfs.
  arg_parser.add_argument(
      '--dry-run',
      default=False,
      action='store_true',
      help="process but don't modify any file.")

  # Required positional: where the rootfs lives.
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  options = arg_parser.parse_args(argv)
  options.Freeze()
  return options
340
341
def main(argv):
  """Script entry point: parse |argv|, then strip the unused gconv modules.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code
347 return GconvStrip(opts)