#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
33"""

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

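    # Treat the exporter's own source files as inputs as well: pages are
    # rebuilt if this script or its helper modules changed after the
    # cached copy was written.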
    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

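    # Drain the queue; any failed task fails the whole run, and pages.json
    # is only rewritten when every task succeeded.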
    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret

137
Dirk Prankef9959472021-11-09 14:16:33 -0800138def _find_entry_by_path(path, entries):
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700139 for entry in entries.values():
Dirk Pranke304c5342021-11-03 12:34:21 -0700140 if entry['kind'] not in ('webpage', 'listpage',
141 'announcmentspage', 'filecabinet'):
Dirk Prankef9959472021-11-09 14:16:33 -0800142 continue
143 entry_path = _path(entry, entries)
Dirk Pranke7aa01372021-11-05 16:16:09 -0700144 if entry_path == path:
Dirk Prankef9959472021-11-09 14:16:33 -0800145 return entry
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700146 return None
147

def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

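    # Keep a reference to yaml's default string representer so that
    # repr_str() below can fall back to it for single-line strings.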
    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
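                # Scrub the backspace runs that html2markdown appears to use
                # as internal markers; they must not leak into the output.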
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
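    """Appends the rows of a listpage to `content` as a plain HTML table."""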
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(
                    ' xmlns="http://www.w3.org/1999/xhtml"', '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
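    """Rewrites hrefs from the exported pages into site-relative paths."""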
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

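        # Strip the '/_/rsrc/<revision>' prefix that Sites puts on
        # versioned static resources, keeping just the underlying path.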
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries):
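    """Returns an entry's site-absolute path by walking its parent chain."""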
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
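    """Builds the YAML front-matter dict (title, breadcrumbs) for a page."""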
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
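    """Returns True if `path` is missing, older than `mtime`, or forced."""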
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

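    # Attach each listpage's child listitems so that _handle_entry can
    # render the page and its rows together.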
    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
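    """Fetches one page of a feed, caching the JSON under scripts/feeds/."""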
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
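    """Flattens a GData feed entry into a plain dict of the fields we use."""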
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

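    # A listitem's field values live in a separate XML document; fetch it
    # (caching a copy under scripts/) and pull out its spreadsheet fields.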
    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
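    # time.mktime() interprets the parsed struct_time in local time rather
    # than UTC; the constant skew is close enough for the staleness checks
    # this script does.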
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
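    # On any crash, print the traceback and drop into pdb for a post-mortem.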
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)