#!/usr/bin/python
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to remove unused gconv charset modules from a build."""
7
8import ahocorasick
9import glob
10import operator
11import os
12import stat
13
14from chromite.lib import commandline
15from chromite.lib import cros_build_lib
16from chromite.lib import osutils
17
18
# Glob pattern used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: the modules of these charsets are always kept, even when no
# use of them is detected. Any charset name accepted by 'iconv_open' can be
# listed here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# Function names (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open', 'iconv',
    # glib
    'g_convert', 'g_convert_with_fallback', 'g_iconv', 'g_locale_to_utf8',
    'g_get_charset',
)
39
40
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_fn):
    """Initialize the class.

    Args:
      gconv_modules_fn: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_fn

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain starting at |charset|.

    Args:
      charset: A charset name, possibly an alias (of an alias, ...).

    Returns:
      The first name in the chain that is not itself a key of the alias map.
    """
    seen = set()
    while charset in self._alias:
      # A cyclic definition (A -> B -> A) would otherwise loop forever.
      if charset in seen:
        cros_build_lib.Die('Alias loop detected for charset "%s".', charset)
      seen.add(charset)
      charset = self._alias[charset]
    return charset

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      The sorted list of all the alias and charset names found in the file,
      without the 'INTERNAL' pseudo-charset.
    """
    with open(self._filename) as gconv_file:
      for line in gconv_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment
          continue

        lst = line.split()
        if lst[0] == 'module':
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            cros_build_lib.Error('charset "%s" already defined as "%s".',
                                 fromset, self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    cros_build_lib.Debug('Found %d modules and %d alias in %s',
                         len(self._modules), len(self._alias), self._filename)
    # list() keeps the concatenation working when keys() returns a view.
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The .so files of the modules that are not needed by any used charset are
    removed from the directory holding the gconv-modules file.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
                     list returned by Load().
      dry_run: Whether this function should not change any file.
    """

    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])
    # set().union(*...) also copes with an empty modules dict.
    unused_modules = set().union(*self._modules.values()) - used_modules

    cros_build_lib.Debug('Used modules: %s', ', '.join(sorted(used_modules)))

    # Recompute the gconv-modules file with only the included gconv modules.
    # This is done before deleting anything so that a malformed file aborts the
    # run while the modules directory is still untouched.
    result = []
    with open(self._filename) as gconv_file:
      for line in gconv_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            result.append(line)  # Alias to an used module
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    modules_dir = os.path.dirname(self._filename)
    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      cros_build_lib.Debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)
    cros_build_lib.Info('Done. Using %d gconv modules. Removed %d unused'
                        ' modules (%.1f KiB)',
                        len(used_modules), len(unused_modules),
                        unused_size / 1024.)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
167
168
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched with a keyword tree built by the ahocorasick
  module, rather than scanning the corpus once per pattern.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If a pattern is repeated, only its first position is set.
  """
  # Nothing to search for: skip building an empty keyword tree.
  if not patterns:
    return []

  tree = ahocorasick.KeywordTree()
  # Position of the first occurrence of each pattern, so that a match can be
  # mapped back to its slot without a linear patterns.index() scan.
  first_index = {}
  for index, word in enumerate(patterns):
    tree.add(word)
    first_index.setdefault(word, index)
  tree.make()

  result = [False] * len(patterns)
  for start, end in tree.findall(corpus):
    result[first_index[corpus[start:end]]] = True

  return result
191
192
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read here
          are root, debug and dry_run.

  Returns:
    The exit code number indicating whether the process succeeded: 0 when the
    modules were processed, 1 when no gconv-modules file was found.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    cros_build_lib.Warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_fn = gconv_modules_files[0]
  cros_build_lib.Info('Searching for unused gconv files defined in %s',
                      gconv_modules_fn)

  gmods = GconvModules(gconv_modules_fn)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define any of the symbols in GCONV_SYMBOLS. We also include the binaries
  # that define them since there could be internal calls to them from other
  # functions.
  files = set()
  for symbol in GCONV_SYMBOLS:
    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
           '--symbol', symbol, opts.root]
    result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
                                       print_cmd=False)
    symbol_files = result.output.splitlines()
    cros_build_lib.Debug('Symbol %s found on %d files.',
                         symbol, len(symbol_files))
    files.update(symbol_files)

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s + '\0' for s in charsets]
  cros_build_lib.Info('Will search for %d strings in %d files',
                      len(strings), len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    # Sorted so that the warning text does not depend on set ordering.
    cros_build_lib.Warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        'even though they don\'t exist: %s',
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  # Sorted so that the files are processed (and logged) in a stable order.
  for fn in sorted(files):
    used_fn = MultipleStringMatch(strings, osutils.ReadFile(fn, mode='rb'))

    # Accumulate into a real list; a lazy map() would pile up one nested
    # iterator per file where map() does not return a list.
    global_used = [seen or found for seen, found in zip(global_used, used_fn)]
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_fn):
      cros_build_lib.Debug('File %s:', fn)
      for charset_string, found in zip(strings, used_fn):
        if found:
          cros_build_lib.Debug(' - %s', charset_string)

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
274
275
def ParseArgs(argv):
  """Parse |argv| and return the resulting options after calling Freeze().

  Args:
    argv: The list of command-line arguments to parse.

  Returns:
    The options object produced by the parser, with the 'dry_run' flag and the
    'root' positional argument.
  """
  arg_parser = commandline.ArgumentParser()

  # Optional switch: report what would be done without touching any file.
  arg_parser.add_argument('--dry-run',
                          action='store_true',
                          default=False,
                          help='process but don\'t modify any file.')
  # Mandatory positional argument.
  arg_parser.add_argument('root',
                          type='path',
                          help='path to the directory where the rootfs is '
                               'mounted.')

  parsed = arg_parser.parse_args(argv)
  parsed.Freeze()
  return parsed
290
291
def main(argv):
  """Script entry point.

  Args:
    argv: The list of command-line arguments.

  Returns:
    The exit code returned by GconvStrip().
  """
  options = ParseArgs(argv)
  cros_build_lib.Debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code
297 return GconvStrip(opts)