Add scripts that will export the content for the site.

This copies over the scripts from /experimental/website that
export the content from Google Sites, which will let us check
the Markdown content into this repo.

However, the first pass of the exported content will land in a
follow-on CL.

Bug: 1260171
Change-Id: Ia257cbbfde089385ca11a391eb4779a0b9bed0ee
Reviewed-on: https://chromium-review.googlesource.com/c/website/+/3258094
Reviewed-by: Struan Shrimpton <sshrimp@google.com>
Commit-Queue: Dirk Pranke <dpranke@google.com>
diff --git a/scripts/export.py b/scripts/export.py
new file mode 100755
index 0000000..5e7f7e4
--- /dev/null
+++ b/scripts/export.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env vpython3
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Export www.chromium.org to local files.
+
+This script uses the Google GData and Google Sites APIs to extract the
+content from http://www.chromium.org/ and write it into local files
+that can be used to serve the same content.
+
+The APIs are documented at
+
+https://developers.google.com/sites/docs/1.0/developers_guide_protocol
+https://developers.google.com/gdata/docs/json
+
+Because www.chromium.org is a public site, this script requires no
+authentication to work.
+
+The exporting process attempts to convert the original content into
+sane modern HTML as much as possible without changing the appearance
+of any page significantly, with some minor exceptions.
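+
+Typical invocations (the paths shown are illustrative):
+
+    $ ./scripts/export.py                        # export everything
+    $ ./scripts/export.py -t /chromium-projects  # convert one page, print it
+    $ ./scripts/export.py --path-list paths.txt  # export a listed set of paths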
+"""
+
+import argparse
+import io
+import json
+import os
+import pdb
+import sys
+import time
+import traceback
+from urllib.request import urlopen
+from urllib.error import HTTPError, URLError
+
+import yaml
+
+import common
+import html2markdown
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--force', action='store_true',
+                        help='ignore updated timestamps in local cache')
+    parser.add_argument('-t', '--test', action='store_true',
+                        help='convert a single path and print the result')
+    parser.add_argument('-r', '--raw', action='store_true',
+                        help='export the raw page HTML instead of '
+                             'converting to Markdown')
+    parser.add_argument('-v', '--verbose', action='count',
+                        help='print progress while fetching the feed')
+    parser.add_argument('--max_results', type=int, default=5000,
+                        help='maximum number of entries to process')
+    parser.add_argument('--start-index', type=int, default=1,
+                        help='index in the feed to start fetching from')
+    parser.add_argument('--path-list',
+                        help='file containing a list of paths to export')
+    parser.add_argument('path', nargs='*')
+    args = parser.parse_args()
+
+    entries, parents = _entries(args)
+
+    if args.path:
+        paths_to_export = [path if path.startswith('/') else '/' + path
+                           for path in args.path]
+    elif args.path_list:
+        paths_to_export = common.read_paths(args.path_list)
+    else:
+        paths_to_export = []
+
+    max_input_mtime = max(os.stat(__file__).st_mtime,
+                          os.stat(common.__file__).st_mtime,
+                          os.stat(html2markdown.__file__).st_mtime)
+
+    updated = 0
+    paths = []
+
+    if args.test:
+        if not paths_to_export:
+            print('--test requires a path to export')
+            return 1
+        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
+        if entry:
+            metadata = _metadata(entry, entries, parents)
+            path = _path(entry, entries, parents)
+            _ = _handle_entry(path,
+                              (entry, metadata, max_input_mtime, args.force,
+                               args.raw))
+            content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
+                                                          path))
+            print(content)
+            return 0
+        else:
+            print('%s not found' % paths_to_export[0])
+            return 1
+
+    q = common.JobQueue(_handle_entry, common.cpu_count())
+
+    paths_to_export = set(paths_to_export)
+    for i, entry in enumerate(list(entries.values())[:args.max_results]):
+        if entry['kind'] in ('webpage', 'listpage', 'announcementspage',
+                             'filecabinet'):
+            metadata = _metadata(entry, entries, parents)
+            path = _path(entry, entries, parents)
+        elif entry['kind'] == 'attachment':
+            metadata = {}
+            path = entry['url'].replace(
+                'https://sites.google.com/a/chromium.org/dev/', '')
+        else:
+            continue
+
+        if not paths_to_export or (
+                ('/' + path).replace('/index', '') in paths_to_export):
+            q.request(path, (entry, metadata, max_input_mtime,
+                             args.force, False))
+
+    for _, _, did_update in q.results():
+        if did_update:
+            updated += 1
+
+    print('updated %d entries' % updated)
+    return 0
+
+
+def _find_entry_by_path(path, entries, parents):
+    seen = set()
+    for entry in entries.values():
+        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage',
+                                 'filecabinet'):
+            continue
+        entry_path = _path(entry, entries, parents)
+        seen.add(entry_path)
+        if '/' + entry_path in (path, path + '/index'):
+            return entry
+    return None
+
+
+def _handle_entry(task, obj):
+    entry, metadata, max_input_mtime, force, raw = obj
+    err = ''
+    did_update = False
+
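+    # Save the stock string representer so repr_str() below can fall
+    # back to it for single-line strings.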
+    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
+
+    if task in (
+        'developers/jinja',
+        'developers/polymer-1-0',
+        'devtools/breakpoints-tutorial/index.html',
+        'devtools/breakpoints-tutorial/script.js',
+        ):
+        # TODO: Eleventy chokes on these files.
+        return '', False
+
+    def repr_str(dumper, data):
+        if '\n' in data:
+            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
+                                           style='|')
+        return dumper.org_represent_str(data)
+
+    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
+
+    mtime = _to_ts(entry['updated'])
+    if entry['kind'] in ('webpage',
+                         'listpage',
+                         'announcementspage',
+                         'filecabinet'):
+        target_mtime = max(mtime, max_input_mtime)
+        path = '%s/%s.md' % (common.SOURCE_DIR, task)
+        if _needs_update(path, target_mtime, force):
+            if raw:
+                content = entry['content']
+            else:
+                content_sio = io.StringIO(entry['content'])
+                md_sio = io.StringIO()
+                md_sio.write('---\n')
+                md_sio.write(yaml.safe_dump(metadata))
+                md_sio.write('---\n\n')
+                url_converter = _URLConverter()
+                html2markdown.Convert(content_sio, md_sio, url_converter)
+                content = md_sio.getvalue()
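+                # Drop the space+backspace runs the converter can leave
+                # behind in the output.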
+                content = content.replace('    \b\b\b\b', '')
+            did_update = common.write_if_changed(path, content.encode('utf-8'))
+        else:
+            did_update = False
+    elif entry['kind'] in ('announcement', 'listitem'):
+        # TODO: implement me.
+        pass
+    elif entry['kind'] == 'attachment':
+        path = '%s/%s' % (common.SOURCE_DIR, task)
+        if path in (
+            'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
+            'site/developers/design-documents/cookie-split-loading/objects.png',
+        ):
+            # These are expected 404's that we ignore.
+            did_update = False
+        elif _needs_update(path, mtime, force):
+            try:
+                fp = urlopen(entry['url'])
+                content = fp.read()
+                did_update = common.write_if_changed(path, content)
+            except (HTTPError, URLError, TimeoutError) as e:
+                err = 'Error: %s' % e
+
+    elif entry['kind'] == 'comment':
+        # ignore comments in the migration
+        pass
+    elif entry['kind'] == 'tag':
+        err = 'tag kind not implemented'
+    else:
+        err = 'unknown kind %s' % entry['kind']
+
+    return err, did_update
+
+
+class _URLConverter:
+    def Translate(self, href):
+        if not href:
+            return ''
+
+        for path in common.alternates:
+            if href.startswith(path):
+                href = href.replace(path, '')
+
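+        # Sites rewrites resource URLs as /_/rsrc/<revision>/<path>;
+        # drop the /_/rsrc/<revision> prefix to reach the real resource.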
+        if href.startswith('/_/rsrc'):
+            href = '/' + '/'.join(href.split('/')[4:])
+        if '?' in href:
+            href = href[0:href.index('?')]
+        return href
+
+
+def _path(entry, entries, parents):
+    path = entry['page_name']
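+    # A page that has children becomes a directory, so its own content
+    # lives in an index file.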
+    if entry['id'] in parents:
+        path = path + '/index'
+    parent_id = entry.get('parent_id')
+    while parent_id:
+        path = entries[parent_id]['page_name'] + '/' + path
+        parent_id = entries[parent_id].get('parent_id')
+
+    return path
+
+
+def _metadata(entry, entries, parents):
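+    # Build the YAML front matter for a page: its name, title, and
+    # breadcrumb trail of ancestor pages.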
+    metadata = {}
+    metadata['page_name'] = entry['page_name']
+    metadata['title'] = entry['title']
+
+    crumbs = []
+    parent_id = entry.get('parent_id')
+    while parent_id:
+        parent = entries[parent_id]
+        path = '/' + _path(parent, entries, parents).replace('/index', '')
+        title = parent['title']
+        crumbs = [[path, title]] + crumbs
+        parent_id = parent.get('parent_id')
+
+    metadata['breadcrumbs'] = crumbs
+
+    if metadata['page_name'] in (
+        'chromium-projects',
+        'chromium',
+    ):
+        metadata['use_title_as_h1'] = False
+
+    return metadata
+
+
+def _needs_update(path, mtime, force):
+    if force:
+        return True
+    if os.path.exists(path):
+        st = os.stat(path)
+        return mtime > st.st_mtime
+    return True
+
+
+def _entries(args):
+    entries = {}
+    parents = set()
+
+    # Looks like Sites probably caps results at 500 entries per request,
+    # even if we request more than that.
+    rownum = 0
+    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
+           '?start-index=%d&max-results=%d&alt=json' %
+           (args.start_index, 500))
+    doc, next_url = _fetch(url, args.force)
+
+    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
+        row = _to_row(entry, rownum)
+        entries[row['id']] = row
+        if row.get('parent_id'):
+            parents.add(row['parent_id'])
+    if args.verbose:
+        print(' ... [%d]' % rownum)
+    while next_url:
+        doc, next_url = _fetch(next_url, args.force)
+        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
+            row = _to_row(entry, rownum)
+            entries[row['id']] = row
+            if row.get('parent_id'):
+                parents.add(row['parent_id'])
+        if args.verbose:
+            print(' ... [%d]' % rownum)
+
+    return entries, parents
+
+
+def _fetch(url, force):
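+    # Feed responses are mirrored under scripts/feeds/ so that re-runs
+    # can read from the local cache instead of refetching.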
+    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
+    if _needs_update(path, 0, force):
+        fp = urlopen(url)
+        content = fp.read()
+        doc = json.loads(content)
+        updated = _to_ts(doc['feed']['updated']['$t'])
+        common.write_if_changed(path, content)
+    else:
+        with open(path) as fp:
+            doc = json.load(fp)
+    next_url = _find_link(doc['feed'], 'next')
+    return doc, next_url
+
+
+def _find_link(doc, rel):
+    for ent in doc['link']:
+        if ent['rel'] == rel:
+            return ent['href']
+    return None
+
+
+def _to_row(entry, rownum):
+    row = {
+        'rownum': rownum,
+        'content': entry.get('content', {}).get('$t'),
+        'id': _to_id(entry['id']['$t']),
+        'kind': entry['category'][0]['label'],
+        'published': entry['published']['$t'],
+        'updated': entry['updated']['$t'],
+    }
+
+    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
+    row['title'] = entry.get('title', {}).get('$t')
+    row['alt_url'] = _find_link(entry, 'alternate')
+
+    if row['kind'] == 'attachment':
+        row['url'] = _find_link(entry, 'alternate')
+    else:
+        row['url'] = _find_link(entry, 'self')
+
+    parent_url = _find_link(entry,
+                            'http://schemas.google.com/sites/2008#parent')
+    if parent_url:
+        row['parent_id'] = _to_id(parent_url)
+    return row
+
+
+def _to_id(url):
+    return url[url.rfind('/') + 1:]
+
+
+def _to_ts(iso_time):
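+    # Note: %f is parsed but discarded by struct_time, and mktime()
+    # interprets the timestamp in local time.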
+    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))
+
+
+if __name__ == '__main__':
+    try:
+        sys.exit(main())
+    except Exception:
+        extype, value, tb = sys.exc_info()
+        traceback.print_exc()
+        pdb.post_mortem(tb)