blob: 7a28118f16b449152a93ea74125c73183cd5388a [file] [log] [blame]
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
5"""Script to remove unused gconv charset modules from a build."""
6
7import ahocorasick
8import glob
Alex Deymoda9dd402014-08-13 08:54:18 -07009import lddtree
Alex Deymo2bba3812014-08-13 08:49:09 -070010import operator
11import os
12import stat
13
14from chromite.lib import commandline
15from chromite.lib import cros_build_lib
16from chromite.lib import osutils
17
18
# Glob pattern, relative to the rootfs directory, used to locate the
# gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules. These charset modules are always kept even if no use of
# them is detected. You can specify any charset name as supported by
# 'iconv_open', for example, 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = ('UTF-16', 'UTF-32', 'UNICODE')

# List of function names (symbols) known to take a charset as a parameter.
# Binaries that reference any of these symbols are scanned for charset names.
GCONV_SYMBOLS = (
    # glibc
    'iconv_open',
    'iconv',
    # glib
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
39
40
class GconvModules(object):
  """Class to manipulate the gconv/gconv-modules file and referenced modules.

  This class parses the contents of the gconv-modules file installed by glibc
  which provides the definition of the charsets supported by iconv_open(3). It
  allows to load the current gconv-modules file and rewrite it to include only
  a subset of the supported modules, removing the other modules.

  Each charset is involved on some transformation between that charset and an
  internal representation. This transformation is defined on a .so file loaded
  dynamically with dlopen(3) when the charset defined in this file is requested
  to iconv_open(3).

  See the comments on gconv-modules file for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. We also support a value
    # that is an alias for another charset.
    self._alias = {}

    # The modules dict goes from charset to module names (the filenames without
    # the .so extension). Since several transformations involving the same
    # charset could be defined in different files, the values of this dict are
    # a set of module names.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain starting at |charset| to its final name.

    Args:
      charset: A charset name, which may or may not be an alias.

    Returns:
      The first name reached that is not itself a key of the alias map. This
      is |charset| unchanged when it is not an alias.

    Raises:
      ValueError: If the alias definitions form a loop.
    """
    seen = set()
    while charset in self._alias:
      # Without this guard a cyclic alias definition would spin forever.
      if charset in seen:
        raise ValueError('Alias loop detected for charset "%s".' % charset)
      seen.add(charset)
      charset = self._alias[charset]
    return charset

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with the name of every charset and alias defined in the
      file, without duplicates and without the 'INTERNAL' pseudo-charset.
    """
    with open(self._filename) as modules_file:
      for line in modules_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Blank line or comment.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # Fields after the fourth one are ignored.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            cros_build_lib.Error('charset "%s" already defined as "%s".',
                                 fromset, self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    cros_build_lib.Debug('Found %d modules and %d alias in %s',
                         len(self._modules), len(self._alias), self._filename)
    # A set union avoids reporting a name twice when it appears both as an
    # alias and as a module charset.
    charsets = set(self._alias) | set(self._modules)
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other
    # charsets.
    charsets.discard('INTERNAL')
    return sorted(charsets)

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    Besides rewriting the file, this removes from the modules directory the
    .so files of the unused modules and the libraries that only those modules
    depend on.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
                     list returned by Load().
      dry_run: Whether this function should not change any file.
    """
    # Compute the used modules. A charset whose alias chain ends in a name
    # with no module definition contributes nothing.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules.get(self._ResolveAlias(charset), ()))

    # set().union(*...) also works when no module was loaded at all.
    all_modules = set().union(*self._modules.values())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The set of modules that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, users in libdeps.items()
                       if users.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    cros_build_lib.Debug('Used modules: %s', ', '.join(sorted(used_modules)))
    cros_build_lib.Debug('Used dependency libs: %s',
                         ', '.join(sorted(used_libdeps)))

    unused_size = 0
    for module in sorted(unused_modules):
      module_path = os.path.join(modules_dir, '%s.so' % module)
      unused_size += os.lstat(module_path).st_size
      cros_build_lib.Debug('rm %s', module_path)
      if not dry_run:
        os.unlink(module_path)

    unused_libdeps_size = 0
    for lib in sorted(unused_libdeps):
      lib_path = os.path.join(modules_dir, lib)
      unused_libdeps_size += os.lstat(lib_path).st_size
      cros_build_lib.Debug('rm %s', lib_path)
      if not dry_run:
        os.unlink(lib_path)

    cros_build_lib.Info(
        'Done. Using %d gconv modules. Removed %d unused modules'
        ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
        len(used_modules), len(unused_modules), unused_size / 1024.,
        len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as modules_file:
      for line in modules_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules.get(charset, ())):
            result.append(line)  # Alias to an used module
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
199
200
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans, parallel to |patterns|, stating whether each pattern
    string was found in the corpus or not.
  """
  result = [False] * len(patterns)
  if not patterns:
    # Nothing to search for.
    return result

  # Map each pattern to every position it occupies in |patterns|, so a match
  # is resolved with one dict lookup instead of a linear patterns.index()
  # scan, and so repeated patterns are all flagged.
  positions = {}
  for index, word in enumerate(patterns):
    positions.setdefault(word, []).append(index)

  tree = ahocorasick.KeywordTree()
  for word in patterns:
    tree.add(word)
  tree.make()

  # Each item yielded by findall() is used as a (start, end) slice of corpus.
  for start, end in tree.findall(corpus):
    for index in positions[corpus[start:end]]:
      result[index] = True

  return result
223
224
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The attributes read
          here are root, debug and dry_run.

  Returns:
    The exit code number indicating whether the process succeeded: 0 on
    success, 1 when no gconv-modules file is found under the root.
  """
  root_st = os.lstat(opts.root)
  if not stat.S_ISDIR(root_st.st_mode):
    cros_build_lib.Die('root (%s) must be a directory.', opts.root)

  # Detect the possible locations of the gconv-modules file.
  gconv_modules_files = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))

  if not gconv_modules_files:
    cros_build_lib.Warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules files should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(gconv_modules_files) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = gconv_modules_files[0]
  cros_build_lib.Info('Searching for unused gconv files defined in %s',
                      gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to search for all the binary files on the rootfs that require
  # or define any of the symbols in GCONV_SYMBOLS. We also include the
  # binaries that define them since there could be internal calls to them from
  # other functions.
  files = set()
  for symbol in GCONV_SYMBOLS:
    cmd = ['scanelf', '--mount', '--quiet', '--recursive', '--format', '#s%F',
           '--symbol', symbol, opts.root]
    result = cros_build_lib.RunCommand(cmd, redirect_stdout=True,
                                       print_cmd=False)
    symbol_files = result.output.splitlines()
    cros_build_lib.Debug('Symbol %s found on %d files.',
                         symbol, len(symbol_files))
    files.update(symbol_files)

  # The charsets are represented as nul-terminated strings in the binary files,
  # so we append the '\0' to each string. This prevents some false positives
  # when the name of the charset is a substring of some other string. It doesn't
  # prevent false positives when the charset name is the suffix of another
  # string, for example a binary with the string "DON'T DO IT\0" will match the
  # 'IT' charset. Empirical test on ChromeOS images suggests that only 4
  # charsets could fall in category.
  strings = [s + '\0' for s in charsets]
  cros_build_lib.Info('Will search for %d strings in %d files',
                      len(strings), len(files))

  # Charsets listed in STICKY_MODULES are initialized as used. Note that those
  # strings should be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    # Sorted so that the warning text is the same from run to run.
    cros_build_lib.Warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        'even though they don\'t exist: %s',
        ', '.join(sorted(unknown_sticky_modules)))
  global_used = [charset in STICKY_MODULES for charset in charsets]

  for filename in files:
    used_filename = MultipleStringMatch(strings,
                                        osutils.ReadFile(filename, mode='rb'))

    # Build a real list on every pass so the flags never turn into a chain of
    # nested lazy iterators.
    global_used = [seen or found
                   for seen, found in zip(global_used, used_filename)]
    # Check the debug flag to avoid running an useless loop.
    if opts.debug and any(used_filename):
      cros_build_lib.Debug('File %s:', filename)
      # Log the charset name rather than the nul-terminated search string.
      for charset, found in zip(charsets, used_filename):
        if found:
          cros_build_lib.Debug(' - %s', charset)

  used_charsets = [cs for cs, used in zip(charsets, global_used) if used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
307
308
def ParseArgs(argv):
  """Parse |argv| and return the resulting options object, frozen."""
  arg_parser = commandline.ArgumentParser()

  # Optional switch: go through the motions without touching any file.
  arg_parser.add_argument(
      '--dry-run',
      default=False,
      action='store_true',
      help="process but don't modify any file.")

  # Positional argument: the mounted rootfs to process.
  arg_parser.add_argument(
      'root',
      help='path to the directory where the rootfs is mounted.',
      type='path')

  options = arg_parser.parse_args(argv)
  options.Freeze()
  return options
323
324
def main(argv):
  """Script entry point: parse |argv| and run the gconv stripping.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  cros_build_lib.Debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code
330 return GconvStrip(opts)