Add scripts that will export the content for the site.

This copies over the scripts from /experimental/website that export
the content from Google Sites, which will let us check the Markdown
content into this repo. The first pass of that export will land in a
follow-on CL.
Bug: 1260171
Change-Id: Ia257cbbfde089385ca11a391eb4779a0b9bed0ee
Reviewed-on: https://chromium-review.googlesource.com/c/website/+/3258094
Reviewed-by: Struan Shrimpton <sshrimp@google.com>
Commit-Queue: Dirk Pranke <dpranke@google.com>
diff --git a/scripts/export.py b/scripts/export.py
new file mode 100755
index 0000000..5e7f7e4
--- /dev/null
+++ b/scripts/export.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env vpython3
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Export www.chromium.org to local files.
+
+This script uses the Google GData and Google Sites APIs to extract the
+content from http://www.chromium.org/ and write it into local files
+that can be used to serve the same content.
+
+The APIs are documented at
+
+https://developers.google.com/sites/docs/1.0/developers_guide_protocol
+https://developers.google.com/gdata/docs/json
+
+Because www.chromium.org is a public site, this script requires no
+authentication to work.
+
+The export process attempts to convert the original content into
+sane, modern HTML without significantly changing the appearance of
+any page, apart from a few minor exceptions.
+"""
+
+import argparse
+import io
+import json
+import os
+import pdb
+import sys
+import time
+import traceback
+from urllib.request import urlopen
+from urllib.error import HTTPError, URLError
+
+import yaml
+
+import common
+import html2markdown
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--force', action='store_true',
+                        help='ignore updated timestamps in local cache')
+    parser.add_argument('-t', '--test', action='store_true')
+    parser.add_argument('-r', '--raw', action='store_true')
+    parser.add_argument('-v', '--verbose', action='count')
+    parser.add_argument('--max_results', type=int, default=5000)
+    parser.add_argument('--start-index', type=int, default=1)
+    parser.add_argument('--path-list')
+    parser.add_argument('path', nargs='*')
+    args = parser.parse_args()
+
+    entries, parents = _entries(args)
+
+    if args.path:
+        paths_to_export = [path if path.startswith('/') else '/' + path
+                           for path in args.path]
+    elif args.path_list:
+        paths_to_export = common.read_paths(args.path_list)
+    else:
+        paths_to_export = []
+
+    max_input_mtime = max(os.stat(__file__).st_mtime,
+                          os.stat(common.__file__).st_mtime,
+                          os.stat(html2markdown.__file__).st_mtime)
+
+    updated = 0
+
+    if args.test:
+        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
+        if entry:
+            metadata = _metadata(entry, entries, parents)
+            path = _path(entry, entries, parents)
+            _ = _handle_entry(path,
+                              (entry, metadata, max_input_mtime, args.force,
+                               args.raw))
+            content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
+                                                          path))
+            print(content)
+            return 0
+        else:
+            print('%s not found' % paths_to_export[0])
+            return 1
+
+    q = common.JobQueue(_handle_entry, common.cpu_count())
+
+    paths_to_export = set(paths_to_export)
+    for entry in list(entries.values())[:args.max_results]:
+        if entry['kind'] in ('webpage', 'listpage', 'announcementspage',
+                             'filecabinet'):
+            metadata = _metadata(entry, entries, parents)
+            path = _path(entry, entries, parents)
+        elif entry['kind'] == 'attachment':
+            metadata = {}
+            path = entry['url'].replace(
+                'https://sites.google.com/a/chromium.org/dev/', '')
+        else:
+            continue
+
+        if not paths_to_export or (
+                ('/' + path).replace('/index', '') in paths_to_export):
+            q.request(path, (entry, metadata, max_input_mtime, args.force,
+                             False))
+
+    for path, res, did_update in q.results():
+        if did_update:
+            updated += 1
+
+    print('updated %d entries' % updated)
+
+
+def _find_entry_by_path(path, entries, parents):
+    for entry in entries.values():
+        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage',
+                                 'filecabinet'):
+            continue
+        entry_path = _path(entry, entries, parents)
+        if '/' + entry_path in (path, path + '/index'):
+            return entry
+    return None
+
+
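+# For illustration: each exported page ends up at <common.SOURCE_DIR>/<path>.md
+# (e.g. site/foo/index.md) with YAML front matter followed by the converted
+# Markdown, roughly like this (hypothetical values):
+#
+#   ---
+#   breadcrumbs: []
+#   page_name: foo
+#   title: Foo
+#   ---
+#
+#   ...converted Markdown body...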
+def _handle_entry(task, obj):
+    entry, metadata, max_input_mtime, force, raw = obj
+    err = ''
+    did_update = False
+
+    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
+
+    if task in (
+        'developers/jinja',
+        'developers/polymer-1-0',
+        'devtools/breakpoints-tutorial/index.html',
+        'devtools/breakpoints-tutorial/script.js',
+    ):
+        # TODO: Eleventy chokes on these files.
+        return '', False
+
+    def repr_str(dumper, data):
+        # Dump multi-line strings in YAML block style ('|') so the front
+        # matter stays readable; other strings keep the default style.
+        if '\n' in data:
+            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
+                                           style='|')
+        return dumper.org_represent_str(data)
+
+    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
+
+    mtime = _to_ts(entry['updated'])
+    if entry['kind'] in ('webpage',
+                         'listpage',
+                         'announcementspage',
+                         'filecabinet'):
+        target_mtime = max(mtime, max_input_mtime)
+        path = '%s/%s.md' % (common.SOURCE_DIR, task)
+        # Note: the 'True or' means pages are always regenerated, regardless
+        # of the timestamps.
+        if True or _needs_update(path, target_mtime, force):
+            if raw:
+                content = entry['content']
+            else:
+                content_sio = io.StringIO(entry['content'])
+                md_sio = io.StringIO()
+                md_sio.write('---\n')
+                md_sio.write(yaml.safe_dump(metadata))
+                md_sio.write('---\n\n')
+                url_converter = _URLConverter()
+                html2markdown.Convert(content_sio, md_sio, url_converter)
+                content = md_sio.getvalue()
+                content = content.replace(' \b\b\b\b', '')
+            did_update = common.write_if_changed(path, content.encode('utf-8'))
+        else:
+            did_update = False
+    elif entry['kind'] in ('announcement', 'listitem'):
+        # TODO: implement me.
+        pass
+    elif entry['kind'] == 'attachment':
+        path = '%s/%s' % (common.SOURCE_DIR, task)
+        if path in (
+            'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
+            'site/developers/design-documents/cookie-split-loading/objects.png',
+        ):
+            # These are expected 404's that we ignore.
+            did_update = False
+        elif _needs_update(path, mtime, force):
+            try:
+                fp = urlopen(entry['url'])
+                content = fp.read()
+                did_update = common.write_if_changed(path, content)
+            except (HTTPError, URLError, TimeoutError) as e:
+                err = 'Error: %s' % e
+    elif entry['kind'] == 'comment':
+        # Ignore comments in the migration.
+        pass
+    elif entry['kind'] == 'tag':
+        err = 'tag kind not implemented'
+    else:
+        err = 'unknown kind %s' % entry['kind']
+
+    return err, did_update
+
+
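+# For illustration, _URLConverter.Translate rewrites intra-site hrefs roughly
+# like this (hypothetical URL):
+#
+#   /_/rsrc/1634000000000/developers/design-documents/foo.png?height=400
+#     -> /developers/design-documents/foo.png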
+class _URLConverter:
+    def Translate(self, href):
+        if not href:
+            return ''
+
+        for path in common.alternates:
+            if href.startswith(path):
+                href = href.replace(path, '')
+
+        if href.startswith('/_/rsrc'):
+            href = '/' + '/'.join(href.split('/')[4:])
+        if '?' in href:
+            href = href[0:href.index('?')]
+        return href
+
+
+def _path(entry, entries, parents):
+    path = entry['page_name']
+    if entry['id'] in parents:
+        path = path + '/index'
+    parent_id = entry.get('parent_id')
+    while parent_id:
+        path = entries[parent_id]['page_name'] + '/' + path
+        parent_id = entries[parent_id].get('parent_id')
+
+    return path
+
+
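+# For illustration, _metadata() for a page at /for-testers/faq would produce
+# something like this (hypothetical titles):
+#
+#   {'page_name': 'faq',
+#    'title': 'FAQ',
+#    'breadcrumbs': [['/for-testers', 'For Testers']]}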
+def _metadata(entry, entries, parents):
+    metadata = {}
+    metadata['page_name'] = entry['page_name']
+    metadata['title'] = entry['title']
+
+    crumbs = []
+    parent_id = entry.get('parent_id')
+    while parent_id:
+        parent = entries[parent_id]
+        path = '/' + _path(parent, entries, parents).replace('/index', '')
+        title = parent['title']
+        crumbs = [[path, title]] + crumbs
+        parent_id = parent.get('parent_id')
+
+    metadata['breadcrumbs'] = crumbs
+
+    if metadata['page_name'] in (
+        'chromium-projects',
+        'chromium',
+    ):
+        metadata['use_title_as_h1'] = False
+
+    return metadata
+
+
+def _needs_update(path, mtime, force):
+    if force:
+        return True
+    if os.path.exists(path):
+        st = os.stat(path)
+        return mtime > st.st_mtime
+    return True
+
+
+def _entries(args):
+    entries = {}
+    parents = set()
+
+    # Looks like Sites probably caps results at 500 entries per request,
+    # even if we request more than that.
+    rownum = 0
+    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
+           '?start-index=%d&max-results=%d&alt=json' %
+           (args.start_index, 500 - rownum))
+    doc, next_url = _fetch(url, args.force)
+
+    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
+        row = _to_row(entry, rownum)
+        entries[row['id']] = row
+        if row.get('parent_id'):
+            parents.add(row['parent_id'])
+        if args.verbose:
+            print(' ... [%d]' % rownum)
+
+    while next_url:
+        doc, next_url = _fetch(next_url, args.force)
+        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum + 1):
+            row = _to_row(entry, rownum)
+            entries[row['id']] = row
+            if row.get('parent_id'):
+                parents.add(row['parent_id'])
+            if args.verbose:
+                print(' ... [%d]' % rownum)
+
+    return entries, parents
+
+
+def _fetch(url, force):
+    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
+    if _needs_update(path, 0, force):
+        fp = urlopen(url)
+        content = fp.read()
+        doc = json.loads(content)
+        common.write_if_changed(path, content)
+    else:
+        with open(path) as fp:
+            doc = json.load(fp)
+    next_url = _find_link(doc['feed'], 'next')
+    return doc, next_url
+
+
+def _find_link(doc, rel):
+    for ent in doc['link']:
+        if ent['rel'] == rel:
+            return ent['href']
+    return None
+
+
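+# For illustration, _to_row() reads fields like these from each GData feed
+# entry (abridged; values are hypothetical):
+#
+#   {'id': {'$t': 'https://sites.google.com/feeds/content/chromium.org/dev/123'},
+#    'category': [{'label': 'webpage'}],
+#    'title': {'$t': 'For Testers'},
+#    'sites$pageName': {'$t': 'for-testers'},
+#    'updated': {'$t': '2021-10-20T12:34:56.789Z'},
+#    'content': {'$t': '<div>...</div>'},
+#    'link': [{'rel': 'alternate', 'href': '...'}, {'rel': 'self', 'href': '...'}]}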
+def _to_row(entry, rownum):
+    row = {
+        'rownum': rownum,
+        'content': entry.get('content', {}).get('$t'),
+        'id': _to_id(entry['id']['$t']),
+        'kind': entry['category'][0]['label'],
+        'published': entry['published']['$t'],
+        'updated': entry['updated']['$t'],
+    }
+
+    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
+    row['title'] = entry.get('title', {}).get('$t')
+    row['alt_url'] = _find_link(entry, 'alternate')
+
+    if row['kind'] == 'attachment':
+        row['url'] = _find_link(entry, 'alternate')
+    else:
+        row['url'] = _find_link(entry, 'self')
+
+    parent_url = _find_link(entry,
+                            'http://schemas.google.com/sites/2008#parent')
+    if parent_url:
+        row['parent_id'] = _to_id(parent_url)
+    return row
+
+
+def _to_id(url):
+    return url[url.rfind('/') + 1:]
+
+
+def _to_ts(iso_time):
+    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))
+
+if __name__ == '__main__':
+    try:
+        sys.exit(main())
+    except Exception:
+        extype, value, tb = sys.exc_info()
+        traceback.print_exc()
+        pdb.post_mortem(tb)