#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

  https://developers.google.com/sites/docs/1.0/developers_guide_protocol
  https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
"""

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
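    """Parse arguments, export the requested pages, and report results.

    Returns 0 on success and 1 if any entry failed to export.
    """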
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
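    """Return the page entry whose computed site path matches `path`.

    Only page-like kinds are considered; returns None if nothing matches.
    """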
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
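    """Export a single entry to the local site tree.

    `task` is the site-relative path being exported and `obj` is the
    (entry, metadata, max_input_mtime, force, raw) tuple queued by main();
    this is used both directly (--test) and as the JobQueue handler.
    Returns (err, did_update), where `err` is '' on success.
    """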
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # ignore comments in the migration
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
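    """Render a listpage's listitems as an HTML table, sorted by 'Release'.

    `content` is the writable stream for the page body; values that carry
    an inline XHTML namespace have it stripped first.
    """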
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
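        """Convert a raw Sites href into a site-relative path.

        Strips known alternate URL prefixes and /_/rsrc cache prefixes,
        drops query strings from relative links, and percent-encodes the
        ':' in screenshot filenames.
        """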
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])

        url = urlparse(href)
        if '?' in href and url.netloc == '':
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot', 1)
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
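    """Return the site-relative path of a page by walking its parent links."""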
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
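    """Build the YAML front matter (title, breadcrumbs, etc.) for a page."""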
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
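    """Return True if the file at `path` is missing or older than `mtime`."""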
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
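    """Fetch all content entries from the Sites feed, keyed by entry id.

    Follows the feed's 'next' links to page through results, and attaches
    each listpage's listitem rows to it as 'listitems'.
    """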
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents.get(entry_id, set())
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
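    """Fetch a feed page as JSON, using a local cache under scripts/feeds/.

    Returns (doc, next_url), where next_url is the feed's 'next' link, if
    any.
    """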
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
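    """Return the href of the link with the given rel, or None."""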
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
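    """Flatten a raw GData feed entry into a simple dict.

    For listitem entries, also fetches the item's XML (cached locally) and
    extracts its spreadsheet-style fields into an OrderedDict.
    """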
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
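    """Return the final path segment of `url`, used as an entry id."""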
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
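    """Convert an ISO-8601 timestamp from the feed to a Unix timestamp."""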
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)