blob: f3b8f30f4422e3800da657e60a62ae49e7370124 [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Mike Frysinger9997cc02019-07-17 15:50:01 -040010import functools
Alex Deymo2bba3812014-08-13 08:49:09 -070011import glob
12import operator
13import os
14import stat
15
Mike Frysingercb56b642019-08-25 15:33:08 -040016import ahocorasick # pylint: disable=import-error
Mike Frysinger6db648e2018-07-24 19:57:58 -040017import lddtree
18
Alex Deymo2bba3812014-08-13 08:49:09 -070019from chromite.lib import commandline
20from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070021from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070022from chromite.lib import osutils
23
24
# Glob pattern, relative to the rootfs directory, used to locate the
# gconv-modules file (e.g. usr/lib/gconv/gconv-modules or
# usr/lib64/gconv/gconv-modules).
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules. These charsets modules are always included even if they
# aren't used. You can specify any charset name as supported by 'iconv_open',
# for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# List of function names (symbols) known to use a charset as a parameter.
# Binaries that reference any of these symbols are scanned for charset names.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
45
46
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows to load the current gconv-modules file and rewrite it to include only
  a subset of the supported modules, removing the other modules.

  Each charset is involved on some transformation between that charset and an
  internal representation. This transformation is defined on a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with every alias name and every charset name found in the
      file, with the 'INTERNAL' pseudo-charset removed.
    """
    with open(self._filename) as fp:
      for line in fp:
        # Everything after a '#' is a comment.
        line = line.split('#', 1)[0].strip()
        if not line:
          # Ignore blank lines & comments.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Only the first four fields are used; any extra field is ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            self._modules.setdefault(charset.rstrip('/'), set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The module files (and the helper libraries that only unused modules depend
    on) are deleted from the directory holding the gconv-modules file, and the
    gconv-modules file is rewritten without the lines that refer to them.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
        list returned by Load().
      dry_run: Whether this function should not change any file.

    Raises:
      KeyError: If a used charset (after following its aliases) has no module
        defined in the loaded file.
    """
    # Compute the used modules, following alias chains to the real charset.
    used_modules = set()
    for charset in used_charsets:
      while charset in self._alias:
        charset = self._alias[charset]
      used_modules.update(self._modules[charset])

    # The explicit initial value keeps reduce() from raising when the file
    # defines no modules at all.
    known_modules = functools.reduce(set.union, self._modules.values(), set())
    unused_modules = known_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    all_modules = used_modules | unused_modules
    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    # A library is kept as soon as one used module needs it.
    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as fp:
      for line in fp:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          # Keep comments and copyright headers.
          result.append(line)
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            # Used module
            result.append(line)
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = charset.rstrip('/')
          while charset in self._alias:
            charset = self._alias[charset]
          if used_modules.intersection(self._modules[charset]):
            # Alias to an used module
            result.append(line)
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
210
211
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched in a single pass over the corpus using an
  Aho-Corasick automaton.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not.
  """
  result = [False] * len(patterns)
  if not patterns:
    # An automaton without any word cannot be finalized and iterating over it
    # fails; with nothing to look for the answer is simply the empty list.
    return result

  tree = ahocorasick.Automaton()
  for i, word in enumerate(patterns):
    # The value stored with each word is its index in |patterns|.
    tree.add_word(word, i)
  tree.make_automaton()

  for _, i in tree.iter(corpus):
    result[i] = True

  return result
234
235
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The |root|, |dry_run|
      and |debug| attributes are read.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file was found under the root.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.run(cmd, stdout=True, print_cmd=False,
                              encoding='utf-8')
  files = set(result.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s.encode('utf-8') + b'\0' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filenames = MultipleStringMatch(strings,
                                         osutils.ReadFile(filename, mode='rb'))

    # A charset is used as soon as one file references it.
    global_used = [operator.or_(*x) for x in zip(global_used, used_filenames)]
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_filenames):
      logging.debug('File %s:', filename)
      for i, used_filename in enumerate(used_filenames):
        if used_filename:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
315
316
def ParseArgs(argv):
  """Parse |argv| and return the resulting frozen options object."""
  arg_parser = commandline.ArgumentParser()

  # Optional switch: go through all the steps without touching the rootfs.
  arg_parser.add_argument('--dry-run', action='store_true', default=False,
                          help="process but don't modify any file.")
  # Required positional: where the rootfs to strip is mounted.
  arg_parser.add_argument(
      'root', type='path',
      help='path to the directory where the rootfs is mounted.')

  options = arg_parser.parse_args(argv)
  options.Freeze()
  return options
331
332
def main(argv):
  """Script entry point: parse |argv| and run the gconv stripping pass.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)
  exit_code = GconvStrip(options)
  return exit_code