blob: e07aa11480de1eba18097978ad268924e0e389fb [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Mike Frysinger9997cc02019-07-17 15:50:01 -040010import functools
Alex Deymo2bba3812014-08-13 08:49:09 -070011import glob
12import operator
13import os
14import stat
15
Mike Frysinger6db648e2018-07-24 19:57:58 -040016import ahocorasick
17import lddtree
18
Alex Deymo2bba3812014-08-13 08:49:09 -070019from chromite.lib import commandline
20from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070021from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070022from chromite.lib import osutils
23
24
# Glob pattern, relative to the rootfs, used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: the modules for these charsets are always kept, even when no
# use of them is detected. Any charset name accepted by 'iconv_open' may be
# listed here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# Function names (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
45
46
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with every alias name and every charset name found in the
      file, with the first 'INTERNAL' entry removed. A name that is both an
      alias and a module charset appears twice.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(self._filename) as modules_file:
      for line in modules_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment or blank line.
          continue

        lst = line.split()
        if lst[0] == 'module':
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    # Build explicit lists: dict views cannot be concatenated with '+' on
    # Python 3, while list() works on both Python 2 and Python 3.
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Unused module .so files, and the dependency libraries in the modules
    directory that only unused modules need, are deleted as well.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
        list returned by Load().
      dry_run: Whether this function should not change any file.
    """
    # Compute the used modules, following alias chains to the real charset.
    used_modules = set()
    for charset in used_charsets:
      while charset in self._alias:
        charset = self._alias[charset]
      used_modules.update(self._modules[charset])

    # set().union(*...) also works when no module is defined at all, where a
    # reduce() without an initial value would raise TypeError.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, deps in libdeps.items()
                       if deps.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    # The joined list is passed as the format argument; the format string must
    # not be used as the separator.
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as modules_file:
      for line in modules_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = charset.rstrip('/')
          while charset in self._alias:
            charset = self._alias[charset]
          if used_modules.intersection(self._modules[charset]):
            result.append(line)  # Alias to a used module
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
204
205
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If a pattern is listed more than once, only its first
    position is ever marked as found.
  """
  tree = ahocorasick.KeywordTree()
  # Remember the first position of every pattern so each match is resolved
  # with one dict lookup instead of a linear patterns.index() scan.
  first_index = {}
  for position, word in enumerate(patterns):
    tree.add(word)
    first_index.setdefault(word, position)
  tree.make()

  result = [False] * len(patterns)
  for start, end in tree.findall(corpus):
    result[first_index[corpus[start:end]]] = True

  return result
228
229
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read
      here are root, dry_run and debug.

  Returns:
    The exit code number: 0 once the rewrite step has run, 1 when no
    gconv-modules file is found under the root.
  """
  if not stat.S_ISDIR(os.lstat(opts.root).st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  candidates = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))
  if not candidates:
    logging.warning('gconv-modules file not found.')
    return 1

  # Exactly one gconv-modules file is expected, either under /usr/lib or
  # under /usr/lib64, but never both.
  if len(candidates) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  modules_path = candidates[0]
  logging.info('Searching for unused gconv files defined in %s',
               modules_path)

  gmods = GconvModules(modules_path)
  charsets = gmods.Load()

  # Ask scanelf for every binary on the rootfs that requires or defines one of
  # the known symbols. Binaries defining them are included too, since they may
  # call them internally from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  scanelf_cmd = [
      'scanelf', '--mount', '--quiet', '--recursive',
      '--format', '#s%F',
      '--symbol', symbols,
      opts.root,
  ]
  scan = cros_build_lib.RunCommand(scanelf_cmd, redirect_stdout=True,
                                   print_cmd=False)
  files = set(scan.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # Charset names are stored as nul-terminated strings in the binaries, so a
  # '\0' is appended to each name. That avoids false positives where the
  # charset name is a prefix or an inner substring of another string. It does
  # not help when the name is the suffix of another string: a binary holding
  # "DON'T DO IT\0" still matches the 'IT' charset. Empirical tests on
  # ChromeOS images suggest that only 4 charsets fall in this category.
  strings = [name + '\0' for name in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES start out marked as used. Those names
  # are expected to be present in the gconv-modules file.
  missing_sticky = set(STICKY_MODULES) - set(charsets)
  if missing_sticky:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(missing_sticky))
  global_used = [name in STICKY_MODULES for name in charsets]

  for filename in files:
    found = MultipleStringMatch(strings,
                                osutils.ReadFile(filename, mode='rb'))
    global_used = [seen | hit for seen, hit in zip(global_used, found)]

    # Only walk the per-file matches when debug output was requested.
    if opts.debug and any(found):
      logging.debug('File %s:', filename)
      for needle, hit in zip(strings, found):
        if hit:
          logging.debug(' - %s', needle)

  used_charsets = [name for name, is_used in zip(charsets, global_used)
                   if is_used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
309
310
def ParseArgs(argv):
  """Parse |argv| and return the resulting options object, frozen.

  Args:
    argv: The list of command-line arguments to parse.

  Returns:
    The parsed options, after Freeze() has been called on them.
  """
  arg_parser = commandline.ArgumentParser()
  arg_parser.add_argument(
      '--dry-run',
      action='store_true',
      default=False,
      help="process but don't modify any file.")
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  parsed = arg_parser.parse_args(argv)
  parsed.Freeze()
  return parsed
325
326
def main(argv):
  """Script entry point: parse |argv| and run the gconv stripping step.

  Args:
    argv: The list of command-line arguments.

  Returns:
    The exit code returned by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)
  exit_code = GconvStrip(options)
  return exit_code
332 return GconvStrip(opts)