vendor.py: Add license check and license map generation

Update vendor.py to check that licenses match a set of accepted licenses
and generate a list of all used licenses across the crates. Also add the
capability to generate a license map from the vendored crates.

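The license map is keyed by crate name, and each entry has a few keys:
- license: The license selected for this crate.
- license_file: Path to the license file for this crate, relative to the
                working directory. This is omitted for crates that use
                Apache-2.0. Crates skipped via --skip-license-check (the
                ignorelist) get no entry in the map at all.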

BUG=b:197337740
TEST=./vendor.py

Change-Id: I0343cf641c5f3e047a5fe701e73cef2357e2dfc0
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/rust_crates/+/3115309
Tested-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Reviewed-by: Chris McDonald <cjmcdonald@chromium.org>
Commit-Queue: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
diff --git a/vendor.py b/vendor.py
index 2b3b2fa..db5ef1a 100755
--- a/vendor.py
+++ b/vendor.py
@@ -5,19 +5,26 @@
 # found in the LICENSE file.
 """ This script cleans up the vendor directory.
 """
+import argparse
 import hashlib
 import json
 import os
 import pathlib
+import re
 import subprocess
 
+# We only care about crates we're actually going to use, and that's usually
+# limited to ones with cfg(linux). When running `cargo metadata`, limit
+# results to only this platform.
+DEFAULT_PLATFORM_FILTER = "x86_64-unknown-linux-gnu"
+
 
 def _rerun_checksums(package_path):
     """Re-run checksums for given package.
 
     Writes resulting checksums to $package_path/.cargo-checksum.json.
     """
-    hashes = {}
+    hashes = dict()
     checksum_path = os.path.join(package_path, '.cargo-checksum.json')
     if not pathlib.Path(checksum_path).is_file():
         return False
@@ -42,11 +49,11 @@
                 hashes[key] = d
 
     if hashes:
-        print("{} regenerated {} hashes".format(package_path, len(hashes.keys())))
+        print("{} regenerated {} hashes".format(package_path,
+                                                len(hashes.keys())))
         contents['files'] = hashes
-
         with open(checksum_path, 'w') as fwrite:
-            json.dump(contents, fwrite)
+            json.dump(contents, fwrite, sort_keys=True)
 
     return True
 
@@ -78,7 +85,7 @@
     if del_keys:
         print('{} deleted: {}'.format(root, del_keys))
         with open(checksum_path, 'w') as fwrite:
-            json.dump(contents, fwrite)
+            json.dump(contents, fwrite, sort_keys=True)
 
     return bool(del_keys)
 
@@ -128,7 +135,7 @@
 
     # Don't bother running if patches directory is empty
     if not pathlib.Path(patches_path).is_dir():
-      return
+        return
 
     # Look for all patches and apply them
     for d in os.listdir(patches_path):
@@ -155,11 +162,11 @@
                 print("Failed to apply patch: {}".format(patch))
                 break
 
-
     # Re-run checksums for all modified packages since we applied patches.
     for key in checksums_for.keys():
         _rerun_checksums(os.path.join(vendor_path, key))
 
+
 def run_cargo_vendor(working_dir):
     """Runs cargo vendor.
 
@@ -169,7 +176,248 @@
     """
     subprocess.check_call(["cargo", "vendor"], cwd=working_dir)
 
-def main():
+def load_metadata(working_dir, filter_platform=DEFAULT_PLATFORM_FILTER):
+    """Load metadata for manifest at given directory.
+
+    Args:
+        working_dir: Directory to run from.
+        filter_platform: Filter packages to ones configured for this platform.
+    """
+    manifest_path = os.path.join(working_dir, 'Cargo.toml')
+    cmd = [
+        'cargo', 'metadata', '--format-version', '1', "--filter-platform",
+        filter_platform, '--manifest-path', manifest_path
+    ]
+    output = subprocess.check_output(cmd, cwd=working_dir)
+
+    return json.loads(output)
+
+
+class LicenseManager:
+    """ Manage consolidating licenses for all packages."""
+
+    # These are all the licenses we support. Keys are what is seen in metadata and
+    # values are what is expected by the ebuild.
+    SUPPORTED_LICENSES = {
+        'Apache-2.0': 'Apache-2.0',
+        'MIT': 'MIT',
+        'BSD-3-Clause': 'BSD-3',
+        'ISC': 'ISC'
+    }
+
+    # Prefer to take attribution licenses in this order. All these require that
+    # we actually use the license file found in the package so they MUST have
+    # a license file set.
+    PREFERRED_ATTRIB_LICENSE_ORDER = ['MIT', 'BSD-3', 'ISC']
+
+    # If Apache license is found, always prefer it (simplifies attribution)
+    APACHE_LICENSE = 'Apache-2.0'
+
+    # Regex for license files found in the vendored directories. Search for
+    # these files with re.IGNORECASE.
+    #
+    # These will be searched in order with the earlier entries being preferred.
+    LICENSE_NAMES_REGEX = [
+        r'^license-mit$',
+        r'^copyright$',
+        r'^licen[cs]e.*$',
+    ]
+
+    # Some crates have their license file in other crates. This usually occurs
+    # because multiple crates are published from the same git repository and the
+    # license isn't updated in each sub-crate. In these cases, we can just
+    # ignore these packages.
+    MAP_LICENSE_TO_OTHER = {
+        'failure_derive': 'failure',
+        'grpcio-compiler': 'grpcio',
+        'grpcio-sys': 'grpcio',
+        'rustyline-derive': 'rustyline',
+    }
+
+    # Map a package to a specific license and license file. Only use this if
+    # a package doesn't have an easily discoverable license or exports its
+    # license in a weird way. Prefer to patch the project with a license and
+    # upstream the patch instead.
+    STATIC_LICENSE_MAP = {
+        # "package name": ( "license name", "license file relative location")
+    }
+
+    def __init__(self, working_dir, vendor_dir):
+        self.working_dir = working_dir
+        self.vendor_dir = vendor_dir
+
+    def _find_license_in_dir(self, search_dir):
+        for p in os.listdir(search_dir):
+            # Ignore anything that's not a file
+            if not os.path.isfile(os.path.join(search_dir, p)):
+                continue
+
+            # Now check if the name matches any of the regexes
+            # We'll return the first matching file.
+            for regex in self.LICENSE_NAMES_REGEX:
+                if re.search(regex, p, re.IGNORECASE):
+                    yield os.path.join(search_dir, p)
+                    break
+
+    def _guess_license_type(self, license_file):
+        if '-MIT' in license_file:
+            return 'MIT'
+        elif '-APACHE' in license_file:
+            return 'APACHE'
+        elif '-BSD' in license_file:
+            return 'BSD-3'
+
+        with open(license_file, 'r') as f:
+            lines = f.read()
+            if 'MIT' in lines:
+                return 'MIT'
+            elif 'Apache' in lines:
+                return 'APACHE'
+            elif 'BSD 3-Clause' in lines:
+                return 'BSD-3'
+
+        return ''
+
+    def generate_license(self, skip_license_check, print_map_to_file):
+        """Generate single massive license file from metadata."""
+        metadata = load_metadata(self.working_dir)
+
+        has_license_types = set()
+        bad_licenses = {}
+
+        # Keep license map ordered so it generates a consistent license map
+        license_map = {}
+
+        skip_license_check = skip_license_check or []
+
+        for package in metadata['packages']:
+            pkg_name = package['name']
+
+            # Skip vendor libs directly
+            if pkg_name == "vendor_libs":
+                continue
+
+            if pkg_name in skip_license_check:
+                print(
+                    "Skipped license check on {}. Reason: Skipped from command line"
+                    .format(pkg_name))
+                continue
+
+            if pkg_name in self.MAP_LICENSE_TO_OTHER:
+                print(
+                    'Skipped license check on {}. Reason: License already in {}'
+                    .format(pkg_name, self.MAP_LICENSE_TO_OTHER[pkg_name]))
+                continue
+
+            # Check if we have a static license map for this package. Use the
+            # static values if we have it already set.
+            if pkg_name in self.STATIC_LICENSE_MAP:
+                (license, license_file) = self.STATIC_LICENSE_MAP[pkg_name]
+                license_map[pkg_name] = {
+                    "license": license,
+                    "license_file": license_file,
+                }
+                continue
+
+            license_files = []
+            license = package.get('license', '')
+
+            # We ignore the metadata for license file because most crates don't
+            # have it set. Just scan the source for licenses.
+            license_files = [
+                x for x in self._find_license_in_dir(
+                    os.path.join(self.vendor_dir, pkg_name))
+            ]
+
+            # If there are multiple licenses, they are delimited with "OR" or "/"
+            delim = ' OR ' if ' OR ' in license else '/'
+            found = license.split(delim)
+
+            # Filter licenses to ones we support
+            licenses_or = [
+                self.SUPPORTED_LICENSES[f] for f in found
+                if f in self.SUPPORTED_LICENSES
+            ]
+
+            # If apache license is found, always prefer it because it simplifies
+            # license attribution (we can use existing Apache notice)
+            if self.APACHE_LICENSE in licenses_or:
+                has_license_types.add(self.APACHE_LICENSE)
+                license_map[pkg_name] = {'license': self.APACHE_LICENSE}
+
+            # Handle single license that has at least one license file
+            # We pick the first license file and the license
+            elif len(licenses_or) == 1:
+                if license_files:
+                    l = licenses_or[0]
+                    lf = license_files[0]
+
+                    has_license_types.add(l)
+                    license_map[pkg_name] = {
+                        'license': l,
+                        'license_file': os.path.relpath(lf, self.working_dir),
+                    }
+                else:
+                    bad_licenses[pkg_name] = "{} missing license file".format(
+                        licenses_or[0])
+            # Handle multiple licenses
+            elif len(licenses_or) > 1:
+                # Check preferred licenses in order
+                license_found = False
+                for l in self.PREFERRED_ATTRIB_LICENSE_ORDER:
+                    if not l in licenses_or:
+                        continue
+
+                    for f in license_files:
+                        if self._guess_license_type(f) == l:
+                            license_found = True
+                            has_license_types.add(l)
+                            license_map[pkg_name] = {
+                                'license':
+                                l,
+                                'license_file':
+                                os.path.relpath(f, self.working_dir),
+                            }
+                            break
+
+                    # Break out of loop if license is found
+                    if license_found:
+                        break
+            else:
+                bad_licenses[pkg_name] = license
+
+        # If we had any bad licenses, we need to abort
+        if bad_licenses:
+            for k in bad_licenses.keys():
+                print("{} had no acceptable licenses: {}".format(
+                    k, bad_licenses[k]))
+            raise Exception("Bad licenses in vendored packages.")
+
+        # Write license map to file
+        if print_map_to_file:
+            with open(os.path.join(self.working_dir, print_map_to_file),
+                      'w') as lfile:
+                json.dump(license_map, lfile, sort_keys=True)
+
+        # Raise missing licenses unless we have a valid reason to ignore them
+        raise_missing_license = False
+        for name, v in license_map.items():
+            if 'license_file' not in v and v.get('license',
+                                                 '') != self.APACHE_LICENSE:
+                raise_missing_license = True
+                print("  {}: Missing license file. Fix or add to ignorelist.".
+                      format(name))
+
+        if raise_missing_license:
+            raise Exception(
+                "Unhandled missing license file. "
+                "Make sure all are accounted for before continuing.")
+
+        print("Add the following licenses to the ebuild: \n",
+              sorted([x for x in has_license_types]))
+
+
+def main(args):
     current_path = pathlib.Path(__file__).parent.absolute()
     patches = os.path.join(current_path, "patches")
     vendor = os.path.join(current_path, "vendor")
@@ -184,6 +432,18 @@
     apply_patches(patches, vendor)
     cleanup_owners(vendor)
 
+    # Combine license file and check for any bad licenses
+    lm = LicenseManager(current_path, vendor)
+    lm.generate_license(args.skip_license_check, args.license_map)
+
 
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(description='Vendor packages properly')
+    parser.add_argument('--skip-license-check',
+                        '-s',
+                        help='Skip the license check on a specific package',
+                        action='append')
+    parser.add_argument('--license-map', help='Write license map to this file')
+    args = parser.parse_args()
+
+    main(args)