blob: 4c2e1094443869b7e11eae33decad3a700a4c046 [file] [log] [blame]
Mike Frysingere58c0e22017-10-04 15:43:30 -04001# -*- coding: utf-8 -*-
Alex Deymo2bba3812014-08-13 08:49:09 -07002# Copyright 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to remove unused gconv charset modules from a build."""
7
Mike Frysinger383367e2014-09-16 15:06:17 -04008from __future__ import print_function
9
Alex Deymo2bba3812014-08-13 08:49:09 -070010import ahocorasick
Mike Frysinger9997cc02019-07-17 15:50:01 -040011import functools
Alex Deymo2bba3812014-08-13 08:49:09 -070012import glob
Alex Deymoda9dd402014-08-13 08:54:18 -070013import lddtree
Alex Deymo2bba3812014-08-13 08:49:09 -070014import operator
15import os
16import stat
17
18from chromite.lib import commandline
19from chromite.lib import cros_build_lib
Ralph Nathan5a582ff2015-03-20 18:18:30 -070020from chromite.lib import cros_logging as logging
Alex Deymo2bba3812014-08-13 08:49:09 -070021from chromite.lib import osutils
22
23
# Glob pattern, relative to the rootfs, that locates the gconv-modules file.
GCONV_MODULES_PATH = 'usr/*/gconv/gconv-modules'

# Sticky modules: charsets whose modules are kept even when no binary appears
# to use them. Any charset name accepted by 'iconv_open' may be listed here,
# for example 'LATIN1' or 'ISO-8859-1'.
STICKY_MODULES = (
    'UTF-16',
    'UTF-32',
    'UNICODE',
)

# Names of functions (symbols) known to take a charset as a parameter.
_GLIBC_GCONV_SYMBOLS = ('iconv_open', 'iconv')
_GLIB_GCONV_SYMBOLS = (
    'g_convert',
    'g_convert_with_fallback',
    'g_iconv',
    'g_locale_to_utf8',
    'g_get_charset',
)
GCONV_SYMBOLS = _GLIBC_GCONV_SYMBOLS + _GLIB_GCONV_SYMBOLS
44
45
class GconvModules(object):
  """Parse and prune a glibc gconv/gconv-modules file and its modules.

  The gconv-modules file installed by glibc defines the charsets supported by
  iconv_open(3). This class loads that file and can rewrite it so that only a
  chosen subset of the charsets remains, deleting the module files (and the
  helper libraries living next to them) that are no longer referenced.

  Every charset takes part in a transformation between itself and an internal
  representation. That transformation lives in a .so file which is loaded
  dynamically with dlopen(3) when the charset is requested from iconv_open(3).

  See the comments in the gconv-modules file itself for syntax details.
  """

  def __init__(self, gconv_modules_file):
    """Initialize the class.

    Args:
      gconv_modules_file: Path to gconv/gconv-modules file.
    """
    self._filename = gconv_modules_file

    # An alias map of charsets. The key (fromcharset) is the alias name and
    # the value (tocharset) is the real charset name. A value may itself be an
    # alias for another charset.
    self._alias = {}

    # Maps a charset to the set of module names (the filenames without the .so
    # extension) that implement a transformation involving it. Several
    # transformations for the same charset may live in different files.
    self._modules = {}

  def _ResolveAlias(self, charset):
    """Follow the alias chain starting at |charset| to the real charset name."""
    while charset in self._alias:
      charset = self._alias[charset]
    return charset

  @staticmethod
  def _RemoveFiles(paths, dry_run):
    """Delete |paths| (unless |dry_run|) and return their total size in bytes."""
    total_size = 0
    for path in paths:
      total_size += os.lstat(path).st_size
      logging.debug('rm %s', path)
      if not dry_run:
        os.unlink(path)
    return total_size

  def Load(self):
    """Load the charsets from gconv-modules.

    Returns:
      A sorted list with every alias name and every charset name found in the
      file, without the 'INTERNAL' pseudo charset.
    """
    # Use a context manager so the file is closed even if we Die() midway.
    with open(self._filename) as modules_file:
      for line in modules_file:
        line = line.split('#', 1)[0].strip()
        if not line:  # Comment or blank line.
          continue

        lst = line.split()
        if lst[0] == 'module':
          # A trailing cost field is optional, so only look at the first four.
          _, fromset, toset, filename = lst[:4]
          for charset in (fromset, toset):
            charset = charset.rstrip('/')
            self._modules.setdefault(charset, set()).add(filename)
        elif lst[0] == 'alias':
          _, fromset, toset = lst
          fromset = fromset.rstrip('/')
          toset = toset.rstrip('/')
          # Warn if the same charset is defined as two different aliases.
          if self._alias.get(fromset, toset) != toset:
            logging.error('charset "%s" already defined as "%s".', fromset,
                          self._alias[fromset])
          self._alias[fromset] = toset
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    logging.debug('Found %d modules and %d alias in %s', len(self._modules),
                  len(self._alias), self._filename)
    # dict.keys() returns a view on Python 3 and views cannot be added, so
    # build explicit lists before concatenating.
    charsets = sorted(list(self._alias) + list(self._modules))
    # Remove the 'INTERNAL' charset from the list, since it is not a charset
    # but an internal representation used to convert to and from other charsets.
    if 'INTERNAL' in charsets:
      charsets.remove('INTERNAL')
    return charsets

  def Rewrite(self, used_charsets, dry_run=False):
    """Rewrite gconv-modules file with only the used charsets.

    The modules that no used charset needs, and the dependency libraries in
    the modules directory that only those modules need, are deleted.

    Args:
      used_charsets: A list of used charsets. This should be a subset of the
          list returned by Load().
      dry_run: Whether this function should not change any file.
    """
    # Compute the used modules.
    used_modules = set()
    for charset in used_charsets:
      used_modules.update(self._modules[self._ResolveAlias(charset)])

    # The explicit initializer keeps reduce() from failing when no module was
    # loaded at all.
    all_modules = functools.reduce(set.union, self._modules.values(), set())
    unused_modules = all_modules - used_modules

    modules_dir = os.path.dirname(self._filename)

    # The list of modules that depend on a given library. For example,
    # libdeps['libCNS.so'] is the set of all the modules that require that
    # library. These libraries live in the same directory as the modules.
    libdeps = {}
    for module in all_modules:
      deps = lddtree.ParseELF(os.path.join(modules_dir, '%s.so' % module),
                              modules_dir, [])
      if 'needed' not in deps:
        continue
      for lib in deps['needed']:
        # Ignore the libs without a path defined (outside the modules_dir).
        if deps['libs'][lib]['path']:
          libdeps.setdefault(lib, set()).add(module)

    used_libdeps = set(lib for lib, dependents in libdeps.items()
                       if dependents.intersection(used_modules))
    unused_libdeps = set(libdeps).difference(used_libdeps)

    logging.debug('Used modules: %s', ', '.join(sorted(used_modules)))
    logging.debug('Used dependency libs: %s',
                  ', '.join(sorted(used_libdeps)))

    unused_size = self._RemoveFiles(
        [os.path.join(modules_dir, '%s.so' % module)
         for module in sorted(unused_modules)],
        dry_run)
    unused_libdeps_size = self._RemoveFiles(
        [os.path.join(modules_dir, lib) for lib in sorted(unused_libdeps)],
        dry_run)

    logging.info('Done. Using %d gconv modules. Removed %d unused modules'
                 ' (%.1f KiB) and %d unused dependencies (%.1f KiB)',
                 len(used_modules), len(unused_modules), unused_size / 1024.,
                 len(unused_libdeps), unused_libdeps_size / 1024.)

    # Recompute the gconv-modules file with only the included gconv modules.
    result = []
    with open(self._filename) as modules_file:
      for line in modules_file:
        lst = line.split('#', 1)[0].strip().split()

        if not lst:
          result.append(line)  # Keep comments and copyright headers.
        elif lst[0] == 'module':
          _, _, _, filename = lst[:4]
          if filename in used_modules:
            result.append(line)  # Used module.
        elif lst[0] == 'alias':
          _, charset, _ = lst
          charset = self._ResolveAlias(charset.rstrip('/'))
          if used_modules.intersection(self._modules[charset]):
            result.append(line)  # Alias to a used module.
        else:
          cros_build_lib.Die('Unknown line: %s', line)

    if not dry_run:
      osutils.WriteFile(self._filename, ''.join(result))
203
204
def MultipleStringMatch(patterns, corpus):
  """Search a list of strings in a corpus string.

  All the patterns are searched at once with an Aho-Corasick keyword tree.

  Args:
    patterns: A list of strings.
    corpus: The text where to search for the strings.

  Returns:
    A list of Booleans stating whether each pattern string was found in the
    corpus or not. If a pattern is listed more than once, only its first
    position is ever set to True.
  """
  tree = ahocorasick.KeywordTree()
  for word in patterns:
    tree.add(word)
  tree.make()

  # Look the position of a match up in a dict built once rather than calling
  # patterns.index() for every match, which rescans the whole list each time.
  # setdefault() keeps the first position of a repeated pattern, which is what
  # list.index() would return.
  first_position = {}
  for position, word in enumerate(patterns):
    first_position.setdefault(word, position)

  result = [False] * len(patterns)
  for start, end in tree.findall(corpus):
    result[first_position[corpus[start:end]]] = True

  return result
227
228
def GconvStrip(opts):
  """Process gconv-modules and remove unused modules.

  Args:
    opts: The command-line args passed to the script. The root, debug and
        dry_run attributes are read.

  Returns:
    The exit code number indicating whether the process succeeded: 1 when no
    gconv-modules file is found under the root, 0 otherwise.
  """
  if not stat.S_ISDIR(os.lstat(opts.root).st_mode):
    cros_build_lib.Die('root (%s) must be a directory.' % opts.root)

  # Detect the possible locations of the gconv-modules file.
  candidates = glob.glob(os.path.join(opts.root, GCONV_MODULES_PATH))
  if not candidates:
    logging.warning('gconv-modules file not found.')
    return 1

  # Only one gconv-modules file should be present, either on /usr/lib or
  # /usr/lib64, but not both.
  if len(candidates) > 1:
    cros_build_lib.Die('Found several gconv-modules files.')

  gconv_modules_file = candidates[0]
  logging.info('Searching for unused gconv files defined in %s',
               gconv_modules_file)

  gmods = GconvModules(gconv_modules_file)
  charsets = gmods.Load()

  # Use scanelf to list all the binary files on the rootfs that require or
  # define one of the gconv symbols. Binaries that define them are included
  # too, since they could call them internally from other functions.
  symbols = ','.join(GCONV_SYMBOLS)
  scanelf_cmd = [
      'scanelf', '--mount', '--quiet', '--recursive',
      '--format', '#s%F',
      '--symbol', symbols,
      opts.root,
  ]
  scanelf = cros_build_lib.RunCommand(scanelf_cmd, redirect_stdout=True,
                                      print_cmd=False)
  files = set(scanelf.output.splitlines())
  logging.debug('Symbols %s found on %d files.', symbols, len(files))

  # Charsets are stored as nul-terminated strings in the binary files, so the
  # '\0' is appended to each name. This avoids false positives where the
  # charset name is a substring in the middle of some other string. It does
  # not avoid them when the charset name is the suffix of another string: a
  # binary with the string "DON'T DO IT\0" will match the 'IT' charset.
  # Empirical tests on ChromeOS images suggest that only 4 charsets could fall
  # in that category.
  strings = [name + '\0' for name in charsets]
  logging.info('Will search for %d strings in %d files', len(strings),
               len(files))

  # Charsets listed in STICKY_MODULES start out marked as used. Those names
  # are expected to be listed in the gconv-modules file.
  unknown_sticky_modules = set(STICKY_MODULES) - set(charsets)
  if unknown_sticky_modules:
    logging.warning(
        'The following charsets were explicitly requested in STICKY_MODULES '
        "even though they don't exist: %s",
        ', '.join(unknown_sticky_modules))
  global_used = [name in STICKY_MODULES for name in charsets]

  for filename in files:
    content = osutils.ReadFile(filename, mode='rb')
    hits = MultipleStringMatch(strings, content)

    # Accumulate, per charset, whether any file seen so far mentions it.
    global_used = [seen or hit for seen, hit in zip(global_used, hits)]
    # Check the debug flag first to avoid running a useless loop.
    if opts.debug and any(hits):
      logging.debug('File %s:', filename)
      for needle, hit in zip(strings, hits):
        if hit:
          logging.debug(' - %s', needle)

  used_charsets = [name for name, is_used in zip(charsets, global_used)
                   if is_used]
  gmods.Rewrite(used_charsets, opts.dry_run)
  return 0
308
309
def ParseArgs(argv):
  """Parse |argv| and return the resulting frozen options object."""
  arg_parser = commandline.ArgumentParser()
  arg_parser.add_argument(
      '--dry-run',
      action='store_true',
      default=False,
      help="process but don't modify any file.")
  arg_parser.add_argument(
      'root',
      type='path',
      help='path to the directory where the rootfs is mounted.')

  parsed = arg_parser.parse_args(argv)
  # Freeze the options before handing them out.
  parsed.Freeze()
  return parsed
324
325
def main(argv):
  """Script entry point: parse |argv|, then strip the unused gconv modules.

  Returns:
    The exit code produced by GconvStrip().
  """
  options = ParseArgs(argv)
  logging.debug('Options are %s', options)

  exit_code = GconvStrip(options)
  return exit_code