#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
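
Example invocation (the path here is just an example; see the argument
definitions in main() below):

    vpython3 export.py -j 8 /developers/design-documents

With no positional paths and no --path-list, every page and attachment
found in the feed is exported.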
33"""
34
35import argparse
Dirk Prankef9959472021-11-09 14:16:33 -080036import collections
Dirk Pranke7bbb5472021-11-02 16:33:21 -070037import io
38import json
39import os
40import pdb
41import sys
42import time
43import traceback
Dirk Prankef9959472021-11-09 14:16:33 -080044import xml.etree.ElementTree as ET
45
Dirk Pranke7bbb5472021-11-02 16:33:21 -070046from urllib.request import urlopen
47from urllib.error import HTTPError, URLError
48
49import yaml
50
51import common
52import html2markdown
53
54
55def main():
56 parser = argparse.ArgumentParser()
57 parser.add_argument('--force', action='store_true',
58 help='ignore updated timestamps in local cache')
Dirk Prankef9959472021-11-09 14:16:33 -080059 parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
Dirk Pranke7bbb5472021-11-02 16:33:21 -070060 parser.add_argument('-t', '--test', action='store_true')
61 parser.add_argument('-r', '--raw', action='store_true')
62 parser.add_argument('-v', '--verbose', action='count')
63 parser.add_argument('--max_results', type=int, default=5000)
64 parser.add_argument('--start-index', type=int, default=1)
65 parser.add_argument('--path-list')
66 parser.add_argument('path', nargs='*')
67 args = parser.parse_args()
68
Dirk Prankef9959472021-11-09 14:16:33 -080069 entries = _entries(args)
Dirk Pranke7bbb5472021-11-02 16:33:21 -070070
71 if args.path:
72 paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
73 path)
74 for path in args.path]
75 elif args.path_list:
76 paths_to_export = common.read_paths(args.path_list)
77 else:
78 paths_to_export = []
79
80 max_input_mtime = max(os.stat(__file__).st_mtime,
81 os.stat(common.__file__).st_mtime,
82 os.stat(html2markdown.__file__).st_mtime)
83
84 updated = 0
85 paths = []
86
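    # In --test mode, convert just the first requested path synchronously
    # and print the resulting Markdown to stdout for inspection.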
    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
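    # 'task' is the site-relative path ('/...') of the entry being handled;
    # 'obj' is the work tuple queued in main() (or built in the --test path).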
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

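    # Dump multi-line strings in YAML block style ('|') so the front matter
    # stays readable; single-line strings fall through to the default
    # representer saved in org_represent_str above.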
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
    if not entry['listitems']:
        return

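    # Render the list items as a simple HTML table. Rows sort by their
    # 'Release' column when present; rows without one sort first.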
    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write('  <tr>\n')
    for header in headers:
        content.write('    <th>%s</th>\n' % header)
    content.write('  </tr>\n')
    for row in rows:
        content.write('  <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"',
                                      '')
            content.write('    <td>%s</td>\n' % (value or ''))
        content.write('  </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

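        # Sites resource URLs look like /_/rsrc/<timestamp>/<path>; strip
        # the cache prefix and timestamp, keeping the page-relative path.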
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot')
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = {}
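    # Maps parent entry id -> set of child entry ids; used below to attach
    # 'listitem' children to their parent 'listpage'.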

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
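    # Cache each feed page under scripts/feeds/; since _needs_update() is
    # called with mtime 0, a cached page is only refetched with --force or
    # when the local copy is missing.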
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

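        # Each list-item cell comes back as a <field> element in the Google
        # Spreadsheets XML namespace; keep the column order so the values
        # line up with the table headers.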
        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)