blob: 250b110c04aa0b09c1ce01199a765d5355626e0b [file] [log] [blame]
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
5"""Script to remove unused gconv charset modules from a build."""
6
Mike Frysinger383367e2014-09-16 15:06:17 -04007from __future__ import print_function
8
Alex Deymo2bba3812014-08-13 08:49:09 -07009import ahocorasick
10import glob
Alex Deymoda9dd402014-08-13 08:54:18 -070011import lddtree
Alex Deymo2bba3812014-08-13 08:49:09 -070012import operator
13import os
14import stat
15
16from chromite.lib import commandline
17from chromite.lib import cros_build_lib
18from chromite.lib import osutils
19
20
# Glob pattern (joined onto the rootfs path) used to locate the gconv-modules
# file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules. These charset modules are always kept, even when no binary
# appears to use them. Any charset name accepted by 'iconv_open' may be listed
# here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    'UTF-16',
    'UTF-32',
    'UNICODE',
)

# Names of the functions (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
41
42
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain from |charset| and return the final name."""
    while charset in self._alias:
      charset = self._alias[charset]
    return charset

  def _KnownCharsets(self):
    """Return the sorted, de-duplicated charset names, minus 'INTERNAL'."""
    # A set union works on both Python 2 and 3 (dict views cannot be added) and
    # guarantees that a name present in both tables is reported only once.
    names = set(self._alias).union(self._modules)
    # 'INTERNAL' is not a charset but an internal representation used to
    # convert to and from other charsets.
    names.discard('INTERNAL')
    return sorted(names)

  @staticmethod
  def _RemoveFiles(paths, dry_run):
    """Unlink every file in |paths| (unless |dry_run|); return their size."""
    total_size = 0
    for path in paths:
      total_size += os.lstat(path).st_size
      cros_build_lib.Debug('rm %s', path)
      if not dry_run:
        os.unlink(path)
    return total_size

  def Load(self):
    """Load the charsets from gconv-modules.

    Fills the alias and module tables from the 'alias' and 'module' lines of
    the file. Any other non-comment line aborts through cros_build_lib.Die.

    Returns:
      The sorted list of charset names found (alias names and module charsets),
      without the 'INTERNAL' pseudo charset.
    """
    # Use a context manager so the file is closed even if Die() raises.
    with open(self._filename) as gconv_file:
      for line in gconv_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment or blank line.
          continue

        lst = line.split()
        if lst[0] == 'module':
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            cros_build_lib.Error('charset "%s" already defined as "%s".',
                                 fromset, self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    cros_build_lib.Debug('Found %d modules and %d alias in %s',
                         len(self._modules), len(self._alias), self._filename)
    return self._KnownCharsets()

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Besides rewriting the file, this deletes from the gconv directory the
    module .so files that no used charset needs, and the libraries that only
    those modules depend on.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
                     list returned by Load().
      dry_run: Whether this function should not change any file.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])

    # set().union(*...) also copes with an empty module table, where reduce()
    # without an initial value would raise.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, users in libdeps.items()
                       if users.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    cros_build_lib.Debug('Used modules: %s', ', '.join(sorted(used_modules)))
    cros_build_lib.Debug('Used dependency libs: %s',
                         ', '.join(sorted(used_libdeps)))

    unused_size = self._RemoveFiles(
        [os.path.join(modules_dir, '%s.so' % module)
         for module in sorted(unused_modules)],
        dry_run)
    unused_libdeps_size = self._RemoveFiles(
        [os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)],
        dry_run)

    cros_build_lib.Info(
        'Done. Using %d gconv modules. Removed %d unused modules'
        ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
        len(used_modules), len(unused_modules), unused_size / 1024.,
        len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as gconv_file:
      for line in gconv_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            result.append(line)  # Alias to a used module
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
201
202
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If a pattern is listed more than once, only its first
    position is ever set to True.
  """
  if not patterns:
    # Nothing to look for; skip building an automaton with no keywords.
    return []

  tree = ahocorasick.KeywordTree()
  # Remember the first position of every pattern so that each match can be
  # mapped back in constant time instead of scanning |patterns| again.
  first_index = {}
  for pos, word in enumerate(patterns):
    tree.add(word)
    first_index.setdefault(word, pos)
  tree.make()

  result = [False] * len(patterns)
  for start, end in tree.findall(corpus):
    result[first_index[corpus[start:end]]] = True

  return result
225
226
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read here
          are root, debug and dry_run.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file is found under the root.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.', opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    cros_build_lib.Warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  cros_build_lib.Info('Searching for unused gconv files defined in %s',
                      gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define any of the symbols in GCONV_SYMBOLS. We also include the binaries
  # that define them since there could be internal calls to them from other
  # functions.
  files = set()
  for symbol in GCONV_SYMBOLS:
    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
           '--symbol', symbol, opts.root]
    result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
                                       print_cmd=False)
    symbol_files = result.output.splitlines()
    cros_build_lib.Debug('Symbol %s found on %d files.',
                         symbol, len(symbol_files))
    files.update(symbol_files)

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s + '\0' for s in charsets]
  cros_build_lib.Info('Will search for %d strings in %d files',
                      len(strings), len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    cros_build_lib.Warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        'even though they don\'t exist: %s',
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filename = MultipleStringMatch(strings,
                                        osutils.ReadFile(filename, mode='rb'))

    # Materialise the result: on Python 3 map() is lazy, and re-wrapping it for
    # every file would build one nested iterator per scanned file.
    global_used = list(map(operator.or_, global_used, used_filename))
    # Check the debug flag to avoid running a useless loop.
    if opts.debug and any(used_filename):
      cros_build_lib.Debug('File %s:', filename)
      for string, found in zip(strings, used_filename):
        if found:
          cros_build_lib.Debug(' - %s', string)

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
309
310
def ParseArgs(argv):
  """Parse |argv| and return the resulting frozen options object.

  Args:
    argv: The list of command-line arguments to parse.

  Returns:
    The parsed options, after Freeze() has been called on them.
  """
  arg_parser = commandline.ArgumentParser()

  # Optional switch: go through the motions without touching any file.
  arg_parser.add_argument(
      '--dry-run',
      action='store_true',
      default=False,
      help="process but don't modify any file.")

  # Positional argument: the mounted rootfs to process.
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  parsed = arg_parser.parse_args(argv)
  parsed.Freeze()
  return parsed
325
326
def main(argv):
  """Script entry point: parse |argv|, then strip the unused gconv modules.

  Args:
    argv: The list of command-line arguments.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  cros_build_lib.Debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code
332 return GconvStrip(opts)