Mike Frysinger | f1ba7ad | 2022-09-12 05:42:57 -0400 | [diff] [blame] | 1 | # Copyright 2014 The ChromiumOS Authors |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
Mike Frysinger | 750c5f5 | 2014-09-16 16:16:57 -0400 | [diff] [blame] | 5 | """Script to discover dependencies and other file information from a build. |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 6 | |
Some files in the image are installed to provide specific functionality: for
example chrome, shill or bluetoothd each provide a feature that may or may not
be present on a given build. Many other files are dependencies of these files
and need to be present in the image for them to work. These dependencies come
from needed shared libraries, executed files and other configuration files
read.
| 13 | |
| 14 | This script currently discovers dependencies between ELF files for libraries |
| 15 | required at load time (libraries loaded by the dynamic linker) but not |
Alex Deymo | 365b10c | 2014-08-25 13:14:28 -0700 | [diff] [blame] | 16 | libraries loaded at runtime with dlopen(). It also computes size and file type |
| 17 | in several cases to help understand the contents of the built image. |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 18 | """ |
| 19 | |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 20 | import json |
Chris McDonald | 59650c3 | 2021-07-20 15:29:28 -0600 | [diff] [blame] | 21 | import logging |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 22 | import multiprocessing |
| 23 | import os |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 24 | import stat |
Mike Frysinger | 00688e1 | 2022-04-21 21:22:35 -0400 | [diff] [blame] | 25 | from typing import Union |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 26 | |
Chris McDonald | 59650c3 | 2021-07-20 15:29:28 -0600 | [diff] [blame] | 27 | from chromite.third_party import lddtree |
| 28 | |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 29 | from chromite.lib import commandline |
Alex Deymo | 365b10c | 2014-08-25 13:14:28 -0700 | [diff] [blame] | 30 | from chromite.lib import filetype |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 31 | from chromite.lib import parseelf |
Alex Deymo | c99dd0b | 2014-09-09 16:15:17 -0700 | [diff] [blame] | 32 | from chromite.lib import portage_util |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 33 | |
| 34 | |
# Patterns for Gentoo package atoms. In both patterns group 1 captures the
# "category/package" part. The versioned pattern additionally accepts an
# optional leading "=" and captures the version (including an optional "-rN"
# revision suffix) in group 2.
#
# Examples without a version:
#   chromeos-base/tty
#   chromeos-base/libchrome-271506
#   sys-kernel/chromeos-kernel-3_8
# Examples with a version:
#   chromeos-base/tty-0.0.1-r4
#   chromeos-base/libchrome-271506-r5
#   sys-kernel/chromeos-kernel-3_8-3.8.11-r35
RE_EBUILD_WITHOUT_VERSION = r"^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$"
RE_EBUILD_WITH_VERSION = (
    r"^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)"  # Optional "=", category/package.
    r"\-([^\-]+(\-r\d+)?)$"  # "-" then version with optional revision.
)
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 49 | |
| 50 | |
def ParseELFWithArgs(args):
    """Single-argument adapter around parseelf.ParseELF().

    multiprocessing.Pool.map hands each work item to the callee as one
    argument, so the positional arguments for ParseELF() are packed in a
    sequence and expanded here.

    Args:
        args: The positional arguments for parseelf.ParseELF(). The element at
            index 1 is the relative path echoed back in the result.

    Returns:
        A 2-tuple (args[1], parsed_elf), or None when ParseELF() returns None.
    """
    parsed = parseelf.ParseELF(*args)
    return None if parsed is None else (args[1], parsed)
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 64 | |
| 65 | |
class DepTracker:
    """Tracks dependencies and file information in a root directory.

    This class computes dependencies and other information related to the files
    in the root image.
    """

    def __init__(self, root: Union[str, os.PathLike], jobs: int = 1):
        """Validate the root directory and set up the (optional) worker pool.

        Args:
            root: Path to the root directory to inspect. It is checked with
                lstat(), so it must itself be a directory.
            jobs: Number of worker processes used to parse ELF files. A process
                pool is only created when this is greater than 1.

        Raises:
            Exception: If |root| is not a directory.
        """
        # Assigned before anything that can raise so that __del__ always finds
        # the attribute, including when no pool is created (jobs <= 1).
        self._pool = None

        # TODO(vapier): Convert this to Path.
        root = str(root)
        root_st = os.lstat(root)
        if not stat.S_ISDIR(root_st.st_mode):
            raise Exception("root (%s) must be a directory" % root)
        self._root = root.rstrip("/") + "/"
        self._file_type_decoder = filetype.FileTypeDecoder(root)

        # A wrapper to the multiprocess map function. We avoid launching a pool
        # of processes when jobs is 1 so python exceptions kill the main
        # process, useful for debugging.
        if jobs > 1:
            # Pool is close()d in DepTracker's destructor.
            # pylint: disable=consider-using-with
            self._pool = multiprocessing.Pool(jobs)
            self._imap = self._pool.map
        else:
            self._imap = map

        # Per-file and per-ebuild records, keyed by relative path and by
        # "category/pf" respectively. These are what SaveJSON() writes out.
        self._files = {}
        self._ebuilds = {}

        # Mapping of rel_paths for symlinks and hardlinks. A hardlink maps to
        # the first path seen with the same inode during Init()'s sorted walk.
        self._symlinks = {}
        self._hardlinks = {}

    def __del__(self):
        """Destructor method to free up self._pool resource."""
        # getattr() keeps this safe for instances whose __init__ never ran to
        # the point of creating the attribute.
        pool = getattr(self, "_pool", None)
        if pool is not None:
            pool.close()

    def Init(self):
        """Generates the initial list of files.

        Walks the root directory and records, for every non-directory entry,
        its size. Symlinks additionally get a "symlink" dependency on their
        target, and additional names of an already seen inode are recorded as
        hardlinks.
        """
        # Keyed on (device, inode): inode numbers are only unique within one
        # device, so the device is needed to avoid pairing unrelated files.
        seen_inodes = {}
        for basepath, _, filenames in sorted(os.walk(self._root)):
            for filename in sorted(filenames):
                full_path = os.path.join(basepath, filename)
                rel_path = full_path[len(self._root) :]
                st = os.lstat(full_path)

                file_data = {
                    "size": st.st_size,
                }
                self._files[rel_path] = file_data

                # Track symlinks.
                if stat.S_ISLNK(st.st_mode):
                    link_path = os.readlink(full_path)
                    if link_path and link_path[0] == "/":
                        # Absolute targets are taken relative to the root.
                        target = link_path.lstrip("/")
                    else:
                        # lddtree's normpath handles a little more cases than
                        # the os.path version. In particular, it handles the
                        # '//' case.
                        target = lddtree.normpath(
                            os.path.join(os.path.dirname(rel_path), link_path)
                        )
                    self._symlinks[rel_path] = target
                    file_data["deps"] = {"symlink": [target]}

                # Track hardlinks.
                inode_key = (st.st_dev, st.st_ino)
                if inode_key in seen_inodes:
                    self._hardlinks[rel_path] = seen_inodes[inode_key]
                else:
                    seen_inodes[inode_key] = rel_path

    def SaveJSON(self, filename):
        """Save the computed information to a JSON file.

        Args:
            filename: The destination JSON file.
        """
        data = {
            "files": self._files,
            "ebuilds": self._ebuilds,
        }
        # The context manager guarantees the file is flushed and closed before
        # returning instead of whenever the file object gets collected.
        with open(filename, "w", encoding="utf-8") as fp:
            json.dump(data, fp)

    def ComputeEbuildDeps(self, sysroot):
        """Compute the dependencies between ebuilds and files.

        Iterates over the list of ebuilds in the database and annotates the
        files with the ebuilds they are in. For each ebuild installing a file in
        the root, the total size, file count, atom and version are stored
        internally.

        Args:
            sysroot: The path to the sysroot, for example "/build/link".
        """
        portage_db = portage_util.PortageDB(sysroot)
        if not os.path.exists(portage_db.db_path):
            logging.warning(
                "PortageDB directory not found: %s", portage_db.db_path
            )
            return

        for pkg in portage_db.InstalledPackages():
            pkg_files = []
            pkg_size = 0
            cpf = "%s/%s" % (pkg.category, pkg.pf)
            for typ, rel_path in pkg.ListContents():
                # We ignore other entries like for example "dir".
                if typ not in (pkg.OBJ, pkg.SYM):
                    continue
                # We ignore files installed in the SYSROOT that weren't copied
                # to the image.
                if rel_path not in self._files:
                    continue
                pkg_files.append(rel_path)
                file_data = self._files[rel_path]
                if "ebuild" in file_data:
                    logging.warning(
                        "Duplicated entry for %s: %s and %s",
                        rel_path,
                        file_data["ebuild"],
                        cpf,
                    )
                # The last package listing the file wins.
                file_data["ebuild"] = cpf
                pkg_size += file_data["size"]
            # Ignore packages that don't install any file.
            if not pkg_files:
                continue
            self._ebuilds[cpf] = {
                "size": pkg_size,
                "files": len(pkg_files),
                "atom": "%s/%s" % (pkg.category, pkg.package),
                "version": pkg.version,
            }
            # TODO(deymo): Parse dependencies between ebuilds.

    def ComputeELFFileDeps(self):
        """Computes the dependencies between files.

        Computes the dependencies between the files in the root directory passed
        during construction. The dependencies are inferred for ELF files and
        stored in each file's "deps" dict, keyed by how the dependency was
        discovered:
            'ldd': List with the path of every library listed as needed by the
                ELF file, as reported by parseelf.ParseELF().
            'symlink': Filled in by Init() with the target of a symlink.
        The file type of every parsed ELF file is also stored under "ftype" if
        it was not set yet.

        NOTE(review): the path of a needed library that could not be located is
        presumably None; confirm against parseelf.ParseELF().
        """
        ldpaths = lddtree.LoadLdpaths(self._root)

        # Collect the regular files to hand to the ELF parser. Symlinks and
        # hardlinks are skipped: their target is (or will be) parsed under its
        # own path.
        parseelf_args = []
        for rel_path in self._files:
            if rel_path in self._symlinks or rel_path in self._hardlinks:
                continue

            full_path = os.path.join(self._root, rel_path)
            st = os.lstat(full_path)
            if not stat.S_ISREG(st.st_mode):
                continue
            parseelf_args.append((self._root, rel_path, ldpaths))

        # Parallelize the ELF lookup step since it is quite expensive.
        # ParseELFWithArgs() returns None for files that could not be parsed.
        elfs = dict(
            x
            for x in self._imap(ParseELFWithArgs, parseelf_args)
            if x is not None
        )

        for rel_path, elf in elfs.items():
            file_data = self._files[rel_path]
            # Fill in the ftype if not set yet. We complete this value at this
            # point to avoid re-parsing the ELF file later.
            if "ftype" not in file_data:
                ftype = self._file_type_decoder.GetType(rel_path, elf=elf)
                if ftype:
                    file_data["ftype"] = ftype

            file_deps = file_data.get("deps", {})
            # Dependencies based on the result of ldd.
            for lib in elf.get("needed", []):
                lib_path = elf["libs"][lib]["path"]
                file_deps.setdefault("ldd", []).append(lib_path)

            # Only attach "deps" when there is something to report.
            if file_deps:
                file_data["deps"] = file_deps

    def ComputeFileTypes(self):
        """Computes all the missing file type for the files in the root."""
        for rel_path, file_data in self._files.items():
            if "ftype" in file_data:
                continue
            ftype = self._file_type_decoder.GetType(rel_path)
            if ftype:
                file_data["ftype"] = ftype
Alex Deymo | 365b10c | 2014-08-25 13:14:28 -0700 | [diff] [blame] | 271 | |
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 272 | |
def ParseArgs(argv):
    """Parse |argv| and return the frozen options object."""
    arg_parser = commandline.ArgumentParser()

    # Optional flags.
    arg_parser.add_argument(
        "-j",
        "--jobs",
        default=multiprocessing.cpu_count(),
        type=int,
        help="number of simultaneous jobs.",
    )
    arg_parser.add_argument(
        "--sysroot",
        metavar="SYSROOT",
        type="path",
        help="parse portage DB for ebuild information from the provided sysroot.",
    )
    arg_parser.add_argument(
        "--json",
        type="path",
        help="store information in JSON file.",
    )

    # Positional argument.
    arg_parser.add_argument(
        "root",
        type="path",
        help="path to the directory where the rootfs is mounted.",
    )

    parsed = arg_parser.parse_args(argv)
    parsed.Freeze()
    return parsed
Alex Deymo | 3cfb9cd | 2014-08-18 15:56:35 -0700 | [diff] [blame] | 303 | |
| 304 | |
def main(argv):
    """Script entry point: scan the rootfs and optionally write the report."""
    options = ParseArgs(argv)
    logging.debug("Options are %s", options)

    tracker = DepTracker(options.root, jobs=options.jobs)
    tracker.Init()

    # File-level information: ELF dependencies first, then any file types the
    # ELF pass did not already fill in.
    tracker.ComputeELFFileDeps()
    tracker.ComputeFileTypes()

    # Ebuild information is only available when a sysroot was given.
    sysroot = options.sysroot
    if sysroot:
        tracker.ComputeEbuildDeps(sysroot)

    json_path = options.json
    if json_path:
        tracker.SaveJSON(json_path)