blob: e9ecbe3e321eedaddee757210e46044671afbb9b [file] [log] [blame]
Marc MERLIN0a621942013-09-30 15:22:38 -07001#!/usr/bin/python
2#
3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6#
7
8"""Compares the packages between 2 images by parsing the license file output."""
9
10import re
11
12from chromite.lib import commandline
13
14
def GetPackagesLicensesFromHtml(html_file):
  """Get the list of packages and licenses in a ChromeOS license file.

  The file is scanned line by line with regular expressions; it is not parsed
  as HTML, so each package title or license label must sit on a single line.

  Args:
    html_file: which html license file to scan for packages.

  Returns:
    tuple of dictionary of packages and version numbers and set of licenses.
    The dictionary maps the package name to its version string; if a name
    shows up more than once, the last occurrence wins.

  Raises:
    IOError: if html_file cannot be opened.
  """

  packages = {}
  licenses = set()

  # The name group is greedy, so a title is split into name and version at
  # the last usable hyphen ("foo-bar-1.0-r1" gives "foo-bar-1.0" and "r1").
  pkg_rgx = re.compile(r'<span class="title">(.+)-(.+)</span>')
  # Do not add <pre> in the regex or it would only show the first entry on
  # a package that has multiple hits.
  license_rgx1 = re.compile(r'Scanned (Source License .+):', re.IGNORECASE)
  license_rgx2 = re.compile(r'(Custom License .+):', re.IGNORECASE)
  license_rgx3 = re.compile(r'(Copyright Attribution .+):', re.IGNORECASE)
  # This regex isn't as tight because it has to match these:
  # Gentoo Package Stock License BZIP2:
  # <a ... class="title">Gentoo Package Provided Stock License public-domain</a>
  # <a ... class="title">Gentoo Package Stock License public-domain</a>
  license_rgx4 = re.compile(r'(Stock License [^<:]+)', re.IGNORECASE)
  license_rgx5 = re.compile(r'class="title">(Custom License .+)</a>',
                            re.IGNORECASE)
  # Strips the "-<version>" that precedes the first path component of a
  # scanned source license. Compiled once here instead of on every line.
  version_rgx = re.compile(r'(.+)-([^/]+)/(.+)')
  # Patterns whose first group is recorded verbatim as the license name.
  verbatim_rgxs = (license_rgx2, license_rgx3, license_rgx4, license_rgx5)

  with open(html_file, 'r') as f:
    for line in f:
      # Grep and turn
      # <span class="title">ath6k-34</span>
      # into
      # ath6k 34
      match = pkg_rgx.search(line)
      if match:
        packages[match.group(1)] = match.group(2)

      match = license_rgx1.search(line)
      if match:
        # Turn Source license simplejson-2.5.0/LICENSE.txt
        # into Source license simplejson/LICENSE.txt
        # (we don't want to create diffs based on version numbers)
        lic = version_rgx.sub(r'\1/\3', match.group(1))
        # Old files had this lowercased.
        lic = lic.replace('Source license', 'Source License')
        licenses.add(lic)

      for rgx in verbatim_rgxs:
        match = rgx.search(line)
        if match:
          licenses.add(match.group(1))

  return (packages, licenses)
Marc MERLIN0a621942013-09-30 15:22:38 -070070
71
def ComparePkgLists(pkg_list1, pkg_list2):
  """Compare the package list in 2 dictionaries and output the differences.

  Three groups are printed in this order, separated by an empty line:
  packages only in pkg_list1 (removed), packages only in pkg_list2 (added),
  and packages present in both whose version differs (updated). Each group
  is sorted by package name.

  Args:
    pkg_list1: dict from GetPackagesLicensesFromHtml.
    pkg_list2: dict from GetPackagesLicensesFromHtml.

  Returns:
    N/A (outputs result on stdout).
  """

  old_names = set(pkg_list1)
  new_names = set(pkg_list2)

  # print() with one parenthesized argument and print('') for the separator
  # produce the same output under Python 2 and Python 3.
  for name in sorted(old_names - new_names):
    print('Package removed: %s-%s' % (name, pkg_list1[name]))

  print('')
  for name in sorted(new_names - old_names):
    print('Package added: %s-%s' % (name, pkg_list2[name]))

  print('')
  for name in sorted(old_names & new_names):
    ver1 = pkg_list1[name]
    ver2 = pkg_list2[name]
    if ver1 != ver2:
      print('Package updated: %s from %s to %s' % (name, ver1, ver2))
98
99
def CompareLicenseSets(set1, set2):
  """Compare the license list in 2 sets and output the differences.

  Licenses only in set1 are printed as removed, then an empty line, then
  licenses only in set2 are printed as added. Each group is sorted.

  Args:
    set1: set from GetPackagesLicensesFromHtml.
    set2: set from GetPackagesLicensesFromHtml.

  Returns:
    N/A (outputs result on stdout).
  """

  # print() with one parenthesized argument and print('') for the separator
  # produce the same output under Python 2 and Python 3.
  for name in sorted(set1 - set2):
    print('License removed: %s' % name)

  print('')
  for name in sorted(set2 - set1):
    print('License added: %s' % name)
117
118
def main(args):
  """Print the package and license differences between two license files.

  Args:
    args: command line arguments, handed to the parser's parse_args().
      Two positional arguments are expected: the old html file and the new
      html file.

  Returns:
    N/A (outputs result on stdout).
  """
  parser = commandline.ArgumentParser(usage=__doc__)
  parser.add_argument('html1', metavar='license1.html', type='path',
                      help='old html file')
  parser.add_argument('html2', metavar='license2.html', type='path',
                      help='new html file')
  opts = parser.parse_args(args)

  # Each call returns a (packages dict, licenses set) tuple.
  packages1, licenses1 = GetPackagesLicensesFromHtml(opts.html1)
  packages2, licenses2 = GetPackagesLicensesFromHtml(opts.html2)

  ComparePkgLists(packages1, packages2)
  # print('') emits the separating empty line under Python 2 and Python 3.
  print('')
  CompareLicenseSets(licenses1, licenses2)