#!/usr/bin/python
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to remove unused gconv charset modules from a build."""
7
8import ahocorasick
9import glob
Alex Deymoda9dd402014-08-13 08:54:18 -070010import lddtree
Alex Deymo2bba3812014-08-13 08:49:09 -070011import operator
12import os
13import stat
14
15from chromite.lib import commandline
16from chromite.lib import cros_build_lib
17from chromite.lib import osutils
18
19
# Glob pattern used to locate the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: charsets whose modules are always kept, even when nothing
# appears to use them. Any charset name accepted by 'iconv_open' may be listed
# here, for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# Names of the functions (symbols) known to take a charset as a parameter.
GCONV_SYMBOLS = (
    # Provided by glibc.
    'iconv_open',
    'iconv',
    # Provided by glib.
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
40
41
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows loading the current gconv-modules file and rewriting it to include
  only a subset of the supported modules, removing the other modules.

  Each charset is involved in some transformation between that charset and an
  internal representation. This transformation is defined in a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  from iconv_open(3).

  See the comments in the gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain starting at |charset|.

    Args:
      charset: A charset name, possibly an alias (of an alias, ...).

    Returns:
      The first name in the chain that is not itself an alias, or None if the
      aliases form a loop and therefore never reach a real charset.
    """
    visited = set()
    while charset in self._alias:
      if charset in visited:
        return None
      visited.add(charset)
      charset = self._alias[charset]
    return charset

  @staticmethod
  def _RemoveFiles(paths, dry_run):
    """Log and delete each file in |paths|; return their total size in bytes.

    Args:
      paths: Iterable of file paths to remove, processed in the given order.
      dry_run: When true, the files are only measured and logged, not deleted.
    """
    total_size = 0
    for path in paths:
      total_size += os.lstat(path).st_size
      cros_build_lib.Debug('rm %s', path)
      if not dry_run:
        os.unlink(path)
    return total_size

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with the name of every charset (alias or real) found in
      the file, excluding the special 'INTERNAL' representation.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(self._filename) as modules_file:
      for line in modules_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment or blank line.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # A module line may carry an optional trailing cost field.
          if len(lst) < 4:
            cros_build_lib.Die('Malformed line: %s', line)
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          if len(lst) != 3:
            cros_build_lib.Die('Malformed line: %s', line)
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            cros_build_lib.Error('charset "%s" already defined as "%s".',
                                 fromset, self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    cros_build_lib.Debug('Found %d modules and %d alias in %s',
                         len(self._modules), len(self._alias), self._filename)

    # A name may be both an alias and a module charset; report it only once.
    charsets = set(self._alias)
    charsets.update(self._modules)
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    charsets.discard('INTERNAL')
    return sorted(charsets)

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Deletes the module .so files (and the libraries only they depend on) that
    no used charset needs, then rewrites the gconv-modules file so it only
    lists the modules and aliases that remain usable.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
                     list returned by Load().
      dry_run: Whether this function should not change any file.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      real_charset = self._ResolveAlias(charset)
      if real_charset is None:
        cros_build_lib.Die('Alias loop detected for charset "%s".', charset)
      used_modules.update(self._modules.get(real_charset, ()))

    # set().union(*...) also works when no module was loaded at all.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of charsets that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, users in libdeps.items()
                       if users.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    cros_build_lib.Debug('Used modules: %s', ', '.join(sorted(used_modules)))
    cros_build_lib.Debug('Used dependency libs: %s',
                         ', '.join(sorted(used_libdeps)))

    unused_size = self._RemoveFiles(
        [os.path.join(modules_dir, '%s.so' % module)
         for module in sorted(unused_modules)],
        dry_run)
    unused_libdeps_size = self._RemoveFiles(
        [os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)],
        dry_run)

    cros_build_lib.Info(
        'Done. Using %d gconv modules. Removed %d unused modules'
        ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
        len(used_modules), len(unused_modules), unused_size / 1024.,
        len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as modules_file:
      for line in modules_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          if len(lst) < 4:
            cros_build_lib.Die('Malformed line: %s', line)
          if lst[3] in used_modules:
            result.append(line)  # Used module.
        elif lst[0] == 'alias':
          if len(lst) != 3:
            cros_build_lib.Die('Malformed line: %s', line)
          real_charset = self._ResolveAlias(lst[1].rstrip('/'))
          if real_charset is None:
            cros_build_lib.Die('Alias loop detected in line: %s', line)
          modules = self._modules.get(real_charset)
          # An alias whose target is not provided by any module listed in this
          # file cannot have been affected by the removal, so keep it.
          if modules is None or used_modules.intersection(modules):
            result.append(line)  # Alias to a used module.
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
200
201
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched in a single pass over the corpus using an
  Aho-Corasick keyword tree.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If the same string is listed more than once in |patterns|,
    every one of its positions is reported consistently.
  """
  result = [False] * len(patterns)
  if not patterns:
    # Nothing to search for; avoid building an empty keyword tree.
    return result

  # Map each distinct pattern to every position where it appears, so a match
  # is resolved with one dict lookup instead of a linear patterns.index() scan.
  positions = {}
  tree = ahocorasick.KeywordTree()
  for index, word in enumerate(patterns):
    if word not in positions:
      positions[word] = []
      tree.add(word)
    positions[word].append(index)
  tree.make()

  for start, end in tree.findall(corpus):
    for index in positions[corpus[start:end]]:
      result[index] = True

  return result
224
225
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Locates the single gconv-modules file under |opts.root|, collects the
  binaries that reference any symbol in GCONV_SYMBOLS, searches those binaries
  for the nul-terminated charset names and finally asks GconvModules.Rewrite()
  to drop everything that was not found (plus STICKY_MODULES, always kept).

  Args:
    opts: The command-line args passed to the script.

  Returns:
    The exit code number indicating whether the process succeeded.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.', opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    cros_build_lib.Warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules file should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  cros_build_lib.Info('Searching for unused gconv files defined in %s',
                      gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define any of the GCONV_SYMBOLS. We also include the binaries that
  # define them since there could be internal calls from other functions.
  files = set()
  for symbol in GCONV_SYMBOLS:
    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
           '--symbol', symbol, opts.root]
    result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
                                       print_cmd=False)
    symbol_files = result.output.splitlines()
    cros_build_lib.Debug('Symbol %s found on %d files.',
                         symbol, len(symbol_files))
    files.update(symbol_files)

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in this category.
  strings = [s + '\0' for s in charsets]
  cros_build_lib.Info('Will search for %d strings in %d files',
                      len(strings), len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    cros_build_lib.Warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        'even though they don\'t exist: %s',
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filename = MultipleStringMatch(strings,
                                        osutils.ReadFile(filename, mode='rb'))

    # Accumulate into a concrete list on every iteration. A bare map() would
    # stack one lazy iterator per file on interpreters where map() is lazy.
    global_used = [seen or found
                   for seen, found in zip(global_used, used_filename)]
    # Check the debug flag to avoid running a useless loop.
    if opts.debug and any(used_filename):
      cros_build_lib.Debug('File %s:', filename)
      for string, found in zip(strings, used_filename):
        if found:
          cros_build_lib.Debug(' - %s', string)

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
308
309
def ParseArgs(argv):
  """Parse |argv| and return the resulting frozen options object.

  Args:
    argv: The list of command-line arguments to parse.

  Returns:
    The parsed options, with 'dry_run' and 'root' set, after Freeze() has
    been called on them.
  """
  arg_parser = commandline.ArgumentParser()

  # Optional flag: report what would be done without touching any file.
  arg_parser.add_argument('--dry-run',
                          action='store_true',
                          default=False,
                          help="process but don't modify any file.")
  # Positional argument: the rootfs to process.
  arg_parser.add_argument('root',
                          type='path',
                          help='path to the directory where the rootfs is '
                               'mounted.')

  options = arg_parser.parse_args(argv)
  options.Freeze()
  return options
324
325
def main(argv):
  """Script entry point: parse |argv| and strip the unused gconv modules.

  Args:
    argv: The list of command-line arguments.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  cros_build_lib.Debug('Options are %s', options)
  exit_code = GconvStrip(options)
  return exit_code
331 return GconvStrip(opts)