blob: 9992e04cab12d4ed35e37d5c23219f24ac7664ea [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Alex Deymo2bba3812014-08-13 08:49:09 -070010import ahocorasick
11import glob
Alex Deymoda9dd402014-08-13 08:54:18 -070012import lddtree
Alex Deymo2bba3812014-08-13 08:49:09 -070013import operator
14import os
15import stat
16
17from chromite.lib import commandline
18from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070019from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070020from chromite.lib import osutils
21
22
# Glob pattern used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: charset modules that are always kept, even when no binary
# appears to use them. Any charset name accepted by 'iconv_open' may be listed
# here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    'UTF-16',
    'UTF-32',
    'UNICODE',
)

# Names of the functions (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open', 'iconv',
    # glib
    'g_convert', 'g_convert_with_fallback', 'g_iconv', 'g_locale_to_utf8',
    'g_get_charset',
)
43
44
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc,
  which provides the definition of the charsets supported by iconv_open(3). It
  can load the current gconv-modules file and rewrite it so that it includes
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ModulesForCharset(self, charset):
    """Return the set of module names that implement |charset|.

    Aliases are followed until a real charset name is reached. An alias cycle
    stops the walk instead of looping forever, and a charset without any
    module yields an empty set instead of raising KeyError.

    Args:
      charset: A charset or alias name, without the trailing slashes.

    Returns:
      A (possibly empty) set of module names.
    """
    seen = set()
    while charset in self._alias and charset not in seen:
      seen.add(charset)
      charset = self._alias[charset]
    return self._modules.get(charset, set())

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      The sorted list of alias and charset names found in the file, with the
      'INTERNAL' pseudo-charset removed.
    """
    # The 'with' block guarantees the file is closed, even if Die() raises.
    with open(self._filename) as modules_file:
      for line in modules_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Only the first four fields are used; anything after is ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    # list() keeps this working when dict.keys() returns a view (Python 3).
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The unused module files and the helper libraries only they depend on are
    deleted from the directory holding the gconv-modules file.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
                     list returned by Load().
      dry_run: Whether this function should not change any file.
    """

    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._ModulesForCharset(charset))

    # set().union(*...) also works when no module was loaded at all.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    # A library is kept as soon as one used module needs it.
    used_libdeps = set(lib for lib, users in libdeps.items()
                       if users.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      logging.debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      logging.debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as modules_file:
      for line in modules_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = charset.rstrip('/')
          if used_modules.intersection(self._ModulesForCharset(charset)):
            result.append(line)  # Alias to an used module
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
201
202
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched in a single pass over the corpus using an
  Aho-Corasick keyword tree.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If a pattern is listed more than once, only its first
    position is marked.
  """
  tree = ahocorasick.KeywordTree()
  for word in patterns:
    tree.add(word)
  tree.make()

  # Map every pattern to the position of its first occurrence once, instead of
  # doing a linear patterns.index() scan for every match found in the corpus.
  first_index = {}
  for position, word in enumerate(patterns):
    first_index.setdefault(word, position)

  result = [False] * len(patterns)
  for start, end in tree.findall(corpus):
    result[first_index[corpus[start:end]]] = True

  return result
225
226
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read
          here are root, dry_run and debug.

  Returns:
    The exit code number indicating whether the process succeeded.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.', opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define the symbol iconv_open. We also include the binaries that define
  # it since there could be internal calls to it from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
         '--symbol', symbols, opts.root]
  result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
                                     print_cmd=False)
  files = set(result.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s + '\0' for s in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    # Sorted so that the message is the same from one run to the next.
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        'even though they don\'t exist: %s',
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    # One Boolean per charset: whether its string appears in this file.
    used_in_file = MultipleStringMatch(strings,
                                       osutils.ReadFile(filename, mode='rb'))

    # Build a real list on every iteration; a lazy map() would pile up one
    # nested iterator per scanned file.
    global_used = [seen or found
                   for seen, found in zip(global_used, used_in_file)]
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_in_file):
      logging.debug('File %s:', filename)
      for i, found in enumerate(used_in_file):
        if found:
          logging.debug(' - %s', strings[i])

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
306
307
def ParseArgs(argv):
  """Parse |argv| and return the resulting frozen options object."""
  arg_parser = commandline.ArgumentParser()

  # Optional flag: go through all the steps without touching the files.
  arg_parser.add_argument(
      '--dry-run',
      action='store_true',
      default=False,
      help="process but don't modify any file.")

  # Positional argument: the root of the filesystem to process.
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  parsed = arg_parser.parse_args(argv)
  parsed.Freeze()
  return parsed
322
323
def main(argv):
  """Script entry point: parse |argv| and run GconvStrip.

  Returns:
    The value returned by GconvStrip.
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code