licensing: show diff of packages and licenses.

TEST=hand tested
polgara:/usr/local/google2/gerrit-int/chromium-os/chromite/license-generation$ ../bin/diff_license_html  output.html-M30 output.html-M31
Package removed: argparse-1.2.1
Package removed: busybox-1.21.0
Package removed: ddccontrol-0.4.2
Package removed: ddccontrol-db-20061014
Package removed: htpdate-1.0.4
Package removed: ibus-english-m-0.0.1
Package removed: ibus-m17n-1.3.3
Package removed: jsonrpclib-0_pre20110820
Package removed: libaio-0.3.109
Package removed: libchewing-0.3.2
Package removed: libhangul-0.0.10
Package removed: m17n-contrib-1.1.10
Package removed: m17n-db-1.6.1
Package removed: m17n-lib-1.6.1
Package removed: netifaces-0.8
Package removed: patch-2.6.1
Package removed: python-evdev-0.3.1
Package removed: pyyaml-3.09
Package removed: setproctitle-1.1.6
Package removed: simplejson-2.5.0
Package removed: stressapptest-1.0.4
Package removed: unittest2-0.5.1
Package removed: xxd-1.10

Package added: avahi-0.6.31
Package added: brltty-4.5
Package added: coreboot-2013.04
Package added: crosextrafonts-carlito-20130920
Package added: gcc-libs-0.0.1
Package added: less-441
Package added: libdaemon-0.14
Package added: u-boot-2013.06

Package updated: chinese-input from 1.0.0.0 to 1.1.0.0
Package updated: chromeos-hangul from 1.0.0.12 to 1.1.0.0
Package updated: chromeos-keyboards from 1.1.3.0 to 1.1.4.0
Package updated: curl from 7.23.1 to 7.31.0
Package updated: dbus-glib from 0.100 to 0.100.2
Package updated: dmidecode from 2.10 to 2.11
Package updated: hdparm from 9.20 to 9.39
Package updated: input-tools from 0.0.1 to 2.4.1.0
Package updated: libxml2 from 2.7.8 to 2.9.1
Package updated: portage from 2.1.11.50 to 2.1.10.11
Package updated: timezone-data from 2012j to 2013d

License removed: Source license PyYAML/LICENSE
License removed: Source license argparse/LICENSE.txt
License removed: Source license python-evdev/LICENSE
License removed: Source license setproctitle/COPYRIGHT
License removed: Source license simplejson/LICENSE.txt
License removed: Stock License netiface

License added: Source license crosextrafonts-carlito/LICENSE
License added: Stock License FDL-1.2

BUG=chromium:197970 chromium:271812

Change-Id: Iab0b1dc8fbb9c4722a28c941141a8ebf98b2da96
Reviewed-on: https://chromium-review.googlesource.com/171681
Reviewed-by: Marc MERLIN <merlin@chromium.org>
Commit-Queue: Marc MERLIN <merlin@chromium.org>
Tested-by: Marc MERLIN <merlin@chromium.org>
diff --git a/scripts/diff_license_html.py b/scripts/diff_license_html.py
index c465113..acdeab2 100644
--- a/scripts/diff_license_html.py
+++ b/scripts/diff_license_html.py
@@ -19,23 +19,44 @@
     html_file: which html license file to scan for packages.
 
   Returns:
-    dictionary of packages and version numbers.
+    tuple of dictionary of packages and version numbers and set of licenses.
+
+  Raises:
+    AssertionError: if regex failed.
   """
 
   packages = {}
+  licenses = set()
 
-  # Grep and turn
-  # <span class="title">ath6k-34</span>
-  # into
-  # ath6k 34
-  exp = re.compile(r'<span class="title">(.+)-(.+)</span>')
+  pkg_rgx = re.compile(r'<span class="title">(.+)-(.+)</span>')
+  license_rgx = re.compile(
+      r'(?:Gentoo Package (Stock License .+)</a>|Scanned (Source license .+):)')
   with open(html_file, 'r') as f:
     for line in f:
-      match = exp.search(line)
+      # Grep and turn
+      # <span class="title">ath6k-34</span>
+      # into
+      # ath6k 34
+      match = pkg_rgx.search(line)
       if match:
         packages[match.group(1)] = match.group(2)
 
-  return packages
+      match = license_rgx.search(line)
+      if match:
+        lic = None
+        if match.group(1):
+          lic = match.group(1)
+        else:
+          # Turn Source license simplejson-2.5.0/LICENSE.txt
+          # into Source license simplejson/LICENSE.txt
+          # (we don't want to create diffs based on version numbers)
+          lic = re.sub(r'(.+)-([^/]+)/(.+)', r'\1/\3', match.group(2))
+
+        licenses.add(lic)
+        if not lic:
+          raise AssertionError('License for %s came up empty')
+
+  return (packages, licenses)
 
 
 def ComparePkgLists(pkg_list1, pkg_list2):
@@ -66,6 +87,25 @@
       print 'Package updated: %s from %s to %s' % (changed_package, ver1, ver2)
 
 
+def CompareLicenseSets(set1, set2):
+  """Print licenses found only in set1 as removed and only in set2 as added.
+
+  Args:
+    set1: set of license names, second item of GetTreePackages' result.
+    set2: set of license names to compare against set1, same origin.
+
+  Returns:
+    None; each group is printed to stdout in sorted order.
+  """
+  # Set difference keeps the names present on one side only.
+  for removed_license in sorted(set1 - set2):
+    print 'License removed: %s' % (removed_license)
+
+  print
+  for added_license in sorted(set2 - set1):
+    print 'License added: %s' % (added_license)
+
+
 def main(args):
   parser = commandline.ArgumentParser(usage=__doc__)
   parser.add_argument('html1', metavar='license1.html', type='path',
@@ -76,4 +116,6 @@
 
   pkg_list1 = GetTreePackages(opts.html1)
   pkg_list2 = GetTreePackages(opts.html2)
-  ComparePkgLists(pkg_list1, pkg_list2)
+  ComparePkgLists(pkg_list1[0], pkg_list2[0])
+  print
+  CompareLicenseSets(pkg_list1[1], pkg_list2[1])