licensing: Deal with custom vs stock licenses better.

The old code treated custom licenses as Gentoo ones.
These changes distinguish between the two and, in the process, take
factored-out licenses and fold them back into a package when a
license specified in an ebuild is only used by a single package.

Other cleanups:
- Hack for Marvell International Ltd. license removed (source fixed)
- _GetEbuildPath is now inlined and only run once
- netiface special case removed (we don't ship it)
- ppp special case removed (source contains license file we find now)

diff_license_html is updated accordingly.

TEST=../bin/diff_license_html output.html-M31 output.html > diff
shows:
Package removed: less-441

Package added: patch-2.6.1

Package updated: openvpn from 2.1.12 to 2.3.2
Package updated: portage from 2.1.10.11 to 2.1.11.50

License removed: Stock License Atheros
License added: Custom License Atheros
License removed: Stock License BSD-Google
License added: Custom License BSD-Google
License removed: Stock License BSD-bsdiff
License added: Custom License BSD-bsdiff
License removed: Stock License BSD-dhcpcd
License added: Custom License BSD-dhcpcd
License removed: Stock License BSD-iputils
License added: Custom License BSD-iputils
License removed: Stock License BSD-libevent
License added: Custom License BSD-libevent
License removed: Stock License MIT-MIT
License added: Custom License MIT-MIT
License removed: Stock License MIT-Mesa
License added: Custom License MIT-Mesa
License removed: Stock License X
License added: Custom License X
License removed: Stock License fontconfig
License added: Custom License fontconfig
License removed: Stock License ncurses
License added: Custom License ncurses

License removed: Stock License ppp-2.4.4
License added: Source License ppp/pppd/plugins/pppoatm/COPYING
License added: Source License ppp/pppd/plugins/radius/COPYRIGHT

License removed: Stock License Marvell
License added: Custom License Marvell-sd8787

BUG=chromium:271812

Change-Id: I051145539eefd5dab18df8f0dbd04933af34bc63
Reviewed-on: https://chromium-review.googlesource.com/173345
Reviewed-by: Matt Tennant <mtennant@chromium.org>
Reviewed-by: Marc MERLIN <merlin@chromium.org>
Tested-by: Marc MERLIN <merlin@chromium.org>
Commit-Queue: Marc MERLIN <merlin@chromium.org>
Commit-Queue: David James <davidjames@chromium.org>
diff --git a/scripts/diff_license_html.py b/scripts/diff_license_html.py
index acdeab2..83bd138 100644
--- a/scripts/diff_license_html.py
+++ b/scripts/diff_license_html.py
@@ -12,8 +12,8 @@
 from chromite.lib import commandline
 
 
-def GetTreePackages(html_file):
-  """Get the list of debian packages in an unpacked ProdNG tree.
+def GetPackagesLicensesFromHtml(html_file):
+  """Get the list of packages and licenses in a ChromeOS license file.
 
   Args:
     html_file: which html license file to scan for packages.
@@ -29,8 +29,14 @@
   licenses = set()
 
   pkg_rgx = re.compile(r'<span class="title">(.+)-(.+)</span>')
-  license_rgx = re.compile(
-      r'(?:Gentoo Package (Stock License .+)</a>|Scanned (Source license .+):)')
+  license_rgx1 = re.compile(r'Scanned (Source License .+):', re.IGNORECASE)
+  license_rgx2 = re.compile(r'<pre>(Custom License .+):', re.IGNORECASE)
+  # Gentoo Package Stock License BZIP2:
+  # <a ... class="title">Gentoo Package Stock License public-domain</a>
+  license_rgx3 = re.compile(r'Gentoo Package (Stock License [^<:]+)',
+                            re.IGNORECASE)
+  license_rgx4 = re.compile(r'class="title">(Custom License .+)</a>',
+                            re.IGNORECASE)
   with open(html_file, 'r') as f:
     for line in f:
       # Grep and turn
@@ -41,20 +47,20 @@
       if match:
         packages[match.group(1)] = match.group(2)
 
-      match = license_rgx.search(line)
+      match = license_rgx1.search(line)
       if match:
-        lic = None
-        if match.group(1):
-          lic = match.group(1)
-        else:
-          # Turn Source license simplejson-2.5.0/LICENSE.txt
-          # into Source license simplejson/LICENSE.txt
-          # (we don't want to create diffs based on version numbers)
-          lic = re.sub(r'(.+)-([^/]+)/(.+)', r'\1/\3', match.group(2))
-
+        # Turn Source license simplejson-2.5.0/LICENSE.txt
+        # into Source license simplejson/LICENSE.txt
+        # (we don't want to create diffs based on version numbers)
+        lic = re.sub(r'(.+)-([^/]+)/(.+)', r'\1/\3', match.group(1))
+        # Old files had this lowercased.
+        lic = re.sub(r'Source license', r'Source License', lic)
         licenses.add(lic)
-        if not lic:
-          raise AssertionError('License for %s came up empty')
+
+      for rgx in (license_rgx2, license_rgx3, license_rgx4):
+        match = rgx.search(line)
+        if match:
+          licenses.add(match.group(1))
 
   return (packages, licenses)
 
@@ -63,8 +69,8 @@
   """Compare the package list in 2 dictionaries and output the differences.
 
   Args:
-    pkg_list1: dict from GetTreePackages.
-    pkg_list2: dict from GetTreePackages.
+    pkg_list1: dict from GetPackagesLicensesFromHtml.
+    pkg_list2: dict from GetPackagesLicensesFromHtml.
 
   Returns:
     N/A (outputs result on stdout).
@@ -91,8 +97,8 @@
   """Compare the license list in 2 sets and output the differences.
 
   Args:
-    set1: set from GetTreePackages.
-    set2: set from GetTreePackages.
+    set1: set from GetPackagesLicensesFromHtml.
+    set2: set from GetPackagesLicensesFromHtml.
 
   Returns:
     N/A (outputs result on stdout).
@@ -114,8 +120,8 @@
                       help='new html file')
   opts = parser.parse_args(args)
 
-  pkg_list1 = GetTreePackages(opts.html1)
-  pkg_list2 = GetTreePackages(opts.html2)
+  pkg_list1 = GetPackagesLicensesFromHtml(opts.html1)
+  pkg_list2 = GetPackagesLicensesFromHtml(opts.html2)
   ComparePkgLists(pkg_list1[0], pkg_list2[0])
   print
   CompareLicenseSets(pkg_list1[1], pkg_list2[1])