#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2021 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
""" This script cleans up the vendor directory.
"""
import argparse
import hashlib
import json
import os
import pathlib
import re
import subprocess
import toml

# We only care about crates we're actually going to use and that's usually
# limited to ones with cfg(linux). For running `cargo metadata`, limit results
# to only this platform
DEFAULT_PLATFORM_FILTER = "x86_64-unknown-linux-gnu"


def _rerun_checksums(package_path):
    """Re-run checksums for given package.

    Writes resulting checksums to $package_path/.cargo-checksum.json.
    """
    hashes = dict()
    checksum_path = os.path.join(package_path, '.cargo-checksum.json')
    if not pathlib.Path(checksum_path).is_file():
        return False

    with open(checksum_path, 'r') as fread:
        contents = json.load(fread)

    for root, _, files in os.walk(package_path, topdown=True):
        for f in files:
            # Don't checksum an existing checksum file
            if f == ".cargo-checksum.json":
                continue

            file_path = os.path.join(root, f)
            with open(file_path, 'rb') as frb:
                m = hashlib.sha256()
                m.update(frb.read())
                d = m.hexdigest()

                # Key is relative to the package path so strip from beginning
                key = os.path.relpath(file_path, package_path)
                hashes[key] = d

    if hashes:
        print("{} regenerated {} hashes".format(package_path,
                                                len(hashes.keys())))
        contents['files'] = hashes

    with open(checksum_path, 'w') as fwrite:
        json.dump(contents, fwrite, sort_keys=True)

    return True
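
# For reference, each vendored crate carries a .cargo-checksum.json roughly of
# the form below (illustrative; only the "files" map is rewritten above, the
# top-level "package" digest is left untouched):
#
#   {"files": {"Cargo.toml": "<sha256>", "src/lib.rs": "<sha256>"},
#    "package": "<sha256>"}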


def _remove_OWNERS_checksum(root):
    """ Delete all OWNERS files from the checksum file.

    Args:
        root: Root directory for the vendored crate.

    Returns:
        True if OWNERS was found and cleaned up. Otherwise False.
    """
    checksum_path = os.path.join(root, '.cargo-checksum.json')
    if not pathlib.Path(checksum_path).is_file():
        return False

    with open(checksum_path, 'r') as fread:
        contents = json.load(fread)

    del_keys = []
    for cfile in contents['files']:
        if 'OWNERS' in cfile:
            del_keys.append(cfile)

    for key in del_keys:
        del contents['files'][key]

    if del_keys:
        print('{} deleted: {}'.format(root, del_keys))
        with open(checksum_path, 'w') as fwrite:
            json.dump(contents, fwrite, sort_keys=True)

    return bool(del_keys)


def cleanup_owners(vendor_path):
    """ Remove owners checksums from the vendor directory.

    We currently do not check in the OWNERS files from vendored crates because
    they interfere with the find-owners functionality in gerrit. This cleanup
    simply finds all instances of "OWNERS" in the checksum files within and
    removes them.

    Args:
        vendor_path: Absolute path to vendor directory.
    """
    deps_cleaned = []
    for root, dirs, _ in os.walk(vendor_path):
        for d in dirs:
            removed = _remove_OWNERS_checksum(os.path.join(root, d))
            if removed:
                deps_cleaned.append(d)

    if deps_cleaned:
        print('Cleanup owners:\n {}'.format("\n".join(deps_cleaned)))


def apply_single_patch(patch, workdir):
    """Apply a single patch and return whether it was successful.

    Returns:
        True if successful. False otherwise.
    """
    print("-- Applying {}".format(patch))
    proc = subprocess.run(["patch", "-p1", "-i", patch], cwd=workdir)
    return proc.returncode == 0


def apply_patches(patches_path, vendor_path):
    """Finds patches and applies them to sub-folders in the vendored crates.

    Args:
        patches_path: Path to folder with patches. Expect all patches to be one
                      level down (matching the crate name).
        vendor_path: Root path to vendored crates directory.
    """
    checksums_for = {}

    # Don't bother running if patches directory is empty
    if not pathlib.Path(patches_path).is_dir():
        return

    # Look for all patches and apply them
    for d in os.listdir(patches_path):
        dir_path = os.path.join(patches_path, d)

        # We don't process patches in root dir
        if not os.path.isdir(dir_path):
            continue

        for patch in os.listdir(os.path.join(dir_path)):
            file_path = os.path.join(dir_path, patch)

            # Skip if not a patch file
            if not os.path.isfile(file_path) or not patch.endswith(".patch"):
                continue

            # If there are any patches, queue checksums for that folder.
            checksums_for[d] = True

            # Apply the patch. Exit from patch loop if patching failed.
            success = apply_single_patch(file_path,
                                         os.path.join(vendor_path, d))
            if not success:
                print("Failed to apply patch: {}".format(patch))
                break

    # Re-run checksums for all modified packages since we applied patches.
    for key in checksums_for.keys():
        _rerun_checksums(os.path.join(vendor_path, key))
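
# For illustration, the patches directory is expected to mirror crate names
# (the directory and patch names below are hypothetical):
#
#   patches/
#     some-crate/
#       0001-fix-build.patch
#
# Each patch is applied with `patch -p1` from the matching vendor/<crate>
# directory, so patch paths should carry one leading component to strip
# (e.g. git-style a/ and b/ prefixes).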


def run_cargo_vendor(working_dir):
    """Runs cargo vendor.

    Args:
        working_dir: Directory to run inside. This should be the directory where
                     Cargo.toml is kept.
    """
    subprocess.check_call(["cargo", "vendor"], cwd=working_dir)


def load_metadata(working_dir, filter_platform=DEFAULT_PLATFORM_FILTER):
    """Load metadata for manifest at given directory.

    Args:
        working_dir: Directory to run from.
        filter_platform: Filter packages to ones configured for this platform.
    """
    manifest_path = os.path.join(working_dir, 'Cargo.toml')
    cmd = [
        'cargo', 'metadata', '--format-version', '1', "--filter-platform",
        filter_platform, '--manifest-path', manifest_path
    ]
    output = subprocess.check_output(cmd, cwd=working_dir)

    return json.loads(output)
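
# The `cargo metadata` JSON is consumed below mainly through its "packages"
# list; a minimal sketch of the fields this script reads (values illustrative):
#
#   {"packages": [{"name": "foo", "version": "1.0.0",
#                  "license": "MIT OR Apache-2.0"}]}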


class LicenseManager:
    """ Manage consolidating licenses for all packages."""

    # These are all the licenses we support. Keys are what is seen in metadata
    # and values are what is expected by the ebuild.
    SUPPORTED_LICENSES = {
        'Apache-2.0': 'Apache-2.0',
        'MIT': 'MIT',
        'BSD-3-Clause': 'BSD-3',
        'ISC': 'ISC'
    }

    # Prefer to take attribution licenses in this order. All these require that
    # we actually use the license file found in the package so they MUST have
    # a license file set.
    PREFERRED_ATTRIB_LICENSE_ORDER = ['MIT', 'BSD-3', 'ISC']

    # If Apache license is found, always prefer it (simplifies attribution)
    APACHE_LICENSE = 'Apache-2.0'

    # Regex for license files found in the vendored directories. Search for
    # these files with re.IGNORECASE.
    #
    # These will be searched in order with the earlier entries being preferred.
    LICENSE_NAMES_REGEX = [
        r'^license-mit$',
        r'^copyright$',
        r'^licen[cs]e.*$',
    ]

    # Some crates have their license file in other crates. This usually occurs
    # because multiple crates are published from the same git repository and
    # the license isn't updated in each sub-crate. In these cases, we can just
    # ignore these packages.
    MAP_LICENSE_TO_OTHER = {
        'failure_derive': 'failure',
        'grpcio-compiler': 'grpcio',
        'grpcio-sys': 'grpcio',
        'rustyline-derive': 'rustyline',
    }

    # Map a package to a specific license and license file. Only use this if
    # a package doesn't have an easily discoverable license or exports its
    # license in a weird way. Prefer to patch the project with a license and
    # upstream the patch instead.
    STATIC_LICENSE_MAP = {
        # "package name": ( "license name", "license file relative location")
    }
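
    # A worked example of how the preferences above combine (the crate names
    # and license strings here are hypothetical): a crate declaring
    # "MIT OR Apache-2.0" resolves to Apache-2.0 and needs no license file,
    # while a crate declaring "MIT/ISC" with a LICENSE-MIT file in its vendored
    # directory resolves to MIT with that file recorded in the license map.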

    def __init__(self, working_dir, vendor_dir):
        self.working_dir = working_dir
        self.vendor_dir = vendor_dir

    def _find_license_in_dir(self, search_dir):
        for p in os.listdir(search_dir):
            # Ignore anything that's not a file
            if not os.path.isfile(os.path.join(search_dir, p)):
                continue

            # Now check if the name matches any of the regexes
            # We'll return the first matching file.
            for regex in self.LICENSE_NAMES_REGEX:
                if re.search(regex, p, re.IGNORECASE):
                    yield os.path.join(search_dir, p)
                    break

    def _guess_license_type(self, license_file):
        if '-MIT' in license_file:
            return 'MIT'
        elif '-APACHE' in license_file:
            return 'APACHE'
        elif '-BSD' in license_file:
            return 'BSD-3'

        with open(license_file, 'r') as f:
            lines = f.read()
            if 'MIT' in lines:
                return 'MIT'
            elif 'Apache' in lines:
                return 'APACHE'
            elif 'BSD 3-Clause' in lines:
                return 'BSD-3'

        return ''

    def generate_license(self, skip_license_check, print_map_to_file):
        """Generate single massive license file from metadata."""
        metadata = load_metadata(self.working_dir)

        has_license_types = set()
        bad_licenses = {}

        # Keep license map ordered so it generates a consistent license map
        license_map = {}

        skip_license_check = skip_license_check or []

        for package in metadata['packages']:
            pkg_name = package['name']

            # Skip vendor libs directly
            if pkg_name == "vendor_libs":
                continue

            if pkg_name in skip_license_check:
                print(
                    "Skipped license check on {}. Reason: Skipped from command line"
                    .format(pkg_name))
                continue

            if pkg_name in self.MAP_LICENSE_TO_OTHER:
                print(
                    'Skipped license check on {}. Reason: License already in {}'
                    .format(pkg_name, self.MAP_LICENSE_TO_OTHER[pkg_name]))
                continue

            # Check if we have a static license map for this package. Use the
            # static values if we have it already set.
            if pkg_name in self.STATIC_LICENSE_MAP:
                (license, license_file) = self.STATIC_LICENSE_MAP[pkg_name]
                license_map[pkg_name] = {
                    "license": license,
                    "license_file": license_file,
                }
                continue

            license_files = []
            license = package.get('license', '')

            # We ignore the metadata for license file because most crates don't
            # have it set. Just scan the source for licenses.
            license_files = [
                x for x in self._find_license_in_dir(
                    os.path.join(self.vendor_dir, pkg_name))
            ]

            # If there are multiple licenses, they are delimited with "OR" or "/"
            delim = ' OR ' if ' OR ' in license else '/'
            found = license.split(delim)

            # Filter licenses to ones we support
            licenses_or = [
                self.SUPPORTED_LICENSES[f] for f in found
                if f in self.SUPPORTED_LICENSES
            ]

            # If apache license is found, always prefer it because it simplifies
            # license attribution (we can use existing Apache notice)
            if self.APACHE_LICENSE in licenses_or:
                has_license_types.add(self.APACHE_LICENSE)
                license_map[pkg_name] = {'license': self.APACHE_LICENSE}

            # Handle single license that has at least one license file
            # We pick the first license file and the license
            elif len(licenses_or) == 1:
                if license_files:
                    l = licenses_or[0]
                    lf = license_files[0]

                    has_license_types.add(l)
                    license_map[pkg_name] = {
                        'license': l,
                        'license_file': os.path.relpath(lf, self.working_dir),
                    }
                else:
                    bad_licenses[pkg_name] = "{} missing license file".format(
                        licenses_or[0])
            # Handle multiple licenses
            elif len(licenses_or) > 1:
                # Check preferred licenses in order
                license_found = False
                for l in self.PREFERRED_ATTRIB_LICENSE_ORDER:
                    if l not in licenses_or:
                        continue

                    for f in license_files:
                        if self._guess_license_type(f) == l:
                            license_found = True
                            has_license_types.add(l)
                            license_map[pkg_name] = {
                                'license': l,
                                'license_file':
                                    os.path.relpath(f, self.working_dir),
                            }
                            break

                    # Break out of loop if license is found
                    if license_found:
                        break
            else:
                bad_licenses[pkg_name] = license

        # If we had any bad licenses, we need to abort
        if bad_licenses:
            for k in bad_licenses.keys():
                print("{} had no acceptable licenses: {}".format(
                    k, bad_licenses[k]))
            raise Exception("Bad licenses in vendored packages.")

        # Write license map to file
        if print_map_to_file:
            with open(os.path.join(self.working_dir, print_map_to_file),
                      'w') as lfile:
                json.dump(license_map, lfile, sort_keys=True)

        # Raise missing licenses unless we have a valid reason to ignore them
        raise_missing_license = False
        for name, v in license_map.items():
            if 'license_file' not in v and v.get('license',
                                                 '') != self.APACHE_LICENSE:
                raise_missing_license = True
                print(" {}: Missing license file. Fix or add to ignorelist.".
                      format(name))

        if raise_missing_license:
            raise Exception(
                "Unhandled missing license file. "
                "Make sure all are accounted for before continuing.")

        print("Add the following licenses to the ebuild: \n",
              sorted(has_license_types))


# TODO(abps) - This needs to be replaced with datalog later. We should compile
#              all crab files into datalog and query it with our requirements
#              instead.
class CrabManager:
    """Manage audit files."""
    def __init__(self, working_dir, crab_dir):
        self.working_dir = working_dir
        self.crab_dir = crab_dir

    def _check_bad_traits(self, crabdata):
        """Checks that a package's crab audit meets our requirements.

        Args:
            crabdata: Dict with crab keys in standard templated format.
        """
        common = crabdata['common']
        # TODO(b/200578411) - Figure out what conditions we should enforce as
        #                     part of the audit.
        conditions = [
            common.get('deny', None),
        ]

        # If any conditions are true, this crate is not acceptable.
        return any(conditions)
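
    # For illustration, a crab audit file ({crate_name}-{version}.toml) is
    # expected to look roughly like the sketch below; only the keys read by
    # this script are shown and the real schema may carry more fields:
    #
    #   crate_name = "foo"
    #   version = "1.0.0"
    #
    #   [common]
    #   deny = []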

    def verify_traits(self):
        """ Verify that all required CRAB traits for this repository are met.
        """
        metadata = load_metadata(self.working_dir)

        failing_crates = {}

        # Verify all packages have a CRAB file associated with them and that
        # they meet all our required traits
        for package in metadata['packages']:
            # Skip vendor_libs
            if package['name'] == 'vendor_libs':
                continue

            crabname = "{}-{}".format(package['name'], package['version'])
            filename = os.path.join(self.crab_dir, "{}.toml".format(crabname))

            # If crab file doesn't exist, the crate fails
            if not os.path.isfile(filename):
                failing_crates[crabname] = "No crab file: {}".format(filename)
                continue

            with open(filename, 'r') as f:
                crabdata = toml.loads(f.read())

            # If crab file's crate_name and version keys don't match this
            # package, it also fails. This is just housekeeping...
            if package['name'] != crabdata['crate_name'] or package[
                    'version'] != crabdata['version']:
                failing_crates[crabname] = "Crate name or version don't match"
                continue

            if self._check_bad_traits(crabdata):
                failing_crates[crabname] = "Failed bad traits check"

        # If we had any failing crates, list them now
        if failing_crates:
            print('Failed CRAB audit:')
            for k, v in failing_crates.items():
                print(' {}: {}'.format(k, v))


def main(args):
    current_path = pathlib.Path(__file__).parent.absolute()
    patches = os.path.join(current_path, "patches")
    vendor = os.path.join(current_path, "vendor")
    crab_dir = os.path.join(current_path, "crab", "crates")

    # First, actually run cargo vendor
    run_cargo_vendor(current_path)

    # Order matters here:
    # - Apply patches (also re-calculates checksums)
    # - Cleanup any owners files (otherwise, git check-in or checksums are
    #   unhappy)
    apply_patches(patches, vendor)
    cleanup_owners(vendor)

    # Combine license file and check for any bad licenses
    lm = LicenseManager(current_path, vendor)
    lm.generate_license(args.skip_license_check, args.license_map)

    # Run crab audit on all packages
    crab = CrabManager(current_path, crab_dir)
    crab.verify_traits()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Vendor packages properly')
    parser.add_argument('--skip-license-check',
                        '-s',
                        help='Skip the license check on a specific package',
                        action='append')
    parser.add_argument('--license-map', help='Write license map to this file')
    args = parser.parse_args()

    main(args)