#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
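
Example usage (flags as defined in main() below; the path may be any
page path on the site): the following re-exports a single page and
prints the converted Markdown to stdout:

    $ ./export.py -t /developers/design-documents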
33"""
34
35import argparse
36import io
37import json
38import os
39import pdb
40import sys
41import time
42import traceback
43from urllib.request import urlopen
44from urllib.error import HTTPError, URLError
45
46import yaml
47
48import common
49import html2markdown
50
51
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

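    # Emit multi-line strings in YAML block style ('|') so the generated
    # front matter stays readable.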
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
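        # NOTE: the 'True or' below disables the freshness check, so page
        # content is always regenerated; attachments further down still
        # honor _needs_update().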
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
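                # Scrub the space-plus-backspace sequences that the
                # converter can leave behind.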
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

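        # Sites serves cached copies of resources from URLs of the form
        # /_/rsrc/<timestamp>/<path>; keep only the trailing <path>.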
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
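    # The returned dict becomes the YAML front matter of the exported
    # page. Illustrative (hypothetical) values:
    #   page_name: objects
    #   title: Objects
    #   breadcrumbs:
    #   - [/developers, For Developers]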
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
    if args.verbose:
        print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    return entries, parents


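# Feed pages are cached under scripts/feeds/ so that repeated runs do not
# have to refetch them; pass --force to refetch.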
def _fetch(url, force):
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
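    # A GData feed entry, as consumed below, looks roughly like:
    #   {'id': {'$t': 'https://.../<id>'},
    #    'category': [{'label': 'webpage'}],
    #    'title': {'$t': '...'},
    #    'content': {'$t': '<html>...'},
    #    'published': {'$t': '...'}, 'updated': {'$t': '...'},
    #    'sites$pageName': {'$t': '...'},
    #    'link': [{'rel': 'alternate', 'href': '...'}, ...]}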
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
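    # Note: the feed timestamps are UTC ('Z'), but mktime() interprets
    # the parsed struct_time as local time. The skew is consistent from
    # run to run, which is all the freshness checks above need.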
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])