#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
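
Example usage (a sketch; the flags are defined in main() below, and the
file name export.py is assumed):

    # Export the entire site.
    $ vpython3 export.py

    # Convert a single page and print the result (test mode).
    $ vpython3 export.py -t /developers/design-documents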
33"""
34
import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    # Treat the exporter's own source files as inputs: if any of them is
    # newer than an output file, that output needs to be regenerated.
    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

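    # Fan the work out across a pool of workers; each task converts one
    # page or downloads one attachment.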
    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700118
Dirk Pranke304c5342021-11-03 12:34:21 -0700119 ret = 0
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700120 for path, res, did_update in q.results():
Dirk Pranke304c5342021-11-03 12:34:21 -0700121 if res:
122 ret = 1
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700123 if did_update:
124 updated += 1
125
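    # Only record the full set of exported page paths if every entry
    # exported cleanly.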
    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    # Save the stock string representer so repr_str() below can delegate
    # to it for single-line strings.
    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

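    # Emit multi-line strings in YAML literal block style ('|') so they
    # stay readable in the front matter; everything else falls back to
    # the default representer saved above.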
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                # Write the YAML front matter, then convert the page body
                # from HTML to Markdown.
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                # Strip backspace artifacts the converter can leave behind.
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


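# Rewrites hrefs that point back into the old Sites instance as
# site-relative paths.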
class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            # '/_/rsrc/<timestamp>/<path>' -> '/<path>'.
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


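# Builds the site-relative path for an entry by walking its parent links
# up to the root.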
def _path(entry, entries, parents):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that, so page through the feed 500 at a time.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
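
    # Follow the feed's 'next' links until the last page is reached.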
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
    # Cache each feed response under scripts/feeds/ so that later runs
    # can reuse it instead of refetching.
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


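# Flattens a raw GData feed entry into the simple dict the exporter uses.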
def _to_row(entry, rownum):
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    # Timestamps look like '2021-11-02T16:33:21.123Z'; the fractional
    # seconds are parsed but discarded by time.strptime().
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Print the traceback, then drop into the debugger at the point
        # of failure.
        tb = sys.exc_info()[2]
        traceback.print_exc()
        pdb.post_mortem(tb)