#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
"""

import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
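    """Export the requested paths (or, if none are given, the whole site)."""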
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
                                                          path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage', 'announcementspage',
                             'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '')
        else:
            continue

        if not paths_to_export or (
                ('/' + path).replace('/index', '') in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    for path, res, did_update in q.results():
        if did_update:
            updated += 1

    print('updated %d entries' % updated)


def _find_entry_by_path(path, entries, parents):
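    """Return the page entry whose exported path matches `path`, or None."""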
    seen = set()
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage',
                                 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        seen.add(entry_path)
        if '/' + entry_path in (path, path + '/index'):
            return entry
    return None


def _handle_entry(task, obj):
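    """Export a single feed entry to a local file.

    `task` is the destination path relative to common.SOURCE_DIR, and `obj`
    is an (entry, metadata, max_input_mtime, force, raw) tuple. Pages are
    converted to Markdown with YAML front matter; attachments are downloaded
    as-is. Returns an (err, did_update) tuple.
    """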
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
            'developers/jinja',
            'developers/polymer-1-0',
            'devtools/breakpoints-tutorial/index.html',
            'devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s/%s.md' % (common.SOURCE_DIR, task)
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content.encode('utf-8'))
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s/%s' % (common.SOURCE_DIR, task)
        if path in (
                'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
                'site/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
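    """Rewrites hrefs from the original Sites pages into site-relative paths."""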
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
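    """Return the output path for an entry.

    The path is built from the page names of the entry's ancestors plus its
    own; entries that have children get an '/index' suffix.
    """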
    path = entry['page_name']
    if entry['id'] in parents:
        path = path + '/index'
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return path


def _metadata(entry, entries, parents):
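    """Build the front-matter dict for a page: name, title, and breadcrumbs."""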
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = '/' + _path(parent, entries, parents).replace('/index', '')
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
            'chromium-projects',
            'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
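    """Return True if path is missing or older than mtime, or force is set."""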
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
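    """Fetch the full content feed.

    Returns a dict mapping entry id to row dicts and the set of ids that
    are parents of other entries.
    """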
    entries = {}
    parents = set()

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
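    """Fetch one page of the feed as JSON, caching it under scripts/feeds/.

    The cached copy is reused whenever it exists, unless --force is given.
    Returns (doc, next_url), where next_url is the next page's URL or None.
    """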
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
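    """Flatten a raw GData feed entry into the dict of fields we need."""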
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)