#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane, modern Markdown (with YAML front matter) as much as possible,
without significantly changing the appearance of any page, with some
minor exceptions.
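
Typical invocations (the paths and file names below are illustrative
examples only):

    $ vpython3 export.py /developers/design-documents
    $ vpython3 export.py --path-list paths-to-export.txt --force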
"""

import argparse
import calendar
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

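    # In test mode, export just the first requested path and print the
    # result to stdout instead of writing out the whole site.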
    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

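    # Export everything in parallel, one queued job per page or attachment.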
    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
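    """Returns the page-like entry whose derived path matches, or None."""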
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
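    """Worker that exports a single page or attachment.

    `task` is the site-rooted path and `obj` is the (entry, metadata,
    max_input_mtime, force, raw) tuple passed to JobQueue.request().
    Returns an (err, did_update) pair.
    """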
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
            '/developers/jinja',
            '/developers/polymer-1-0',
            '/devtools/breakpoints-tutorial/index.html',
            '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

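    # Emit multi-line strings in YAML block style ('|') so the generated
    # front matter stays readable.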
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
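        # Note: the 'True or' below bypasses the freshness check, so page
        # content is always regenerated regardless of timestamps.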
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
                '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
                '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
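    """Rewrites hrefs from the original Sites pages into local paths."""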
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

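        # Strip the '/_/rsrc/<N>' prefix (N is typically a revision
        # timestamp) that Sites inserts in front of resource URLs.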
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
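    """Returns the site-rooted path to an entry, e.g. '/parent/page_name'."""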
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
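    """Builds the front-matter dict (page name, title, breadcrumbs)."""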
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
            'chromium-projects',
            'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
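    """Returns True if path is missing or older than mtime (or if forced)."""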
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
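    """Fetches the full content feed, following 'next' links page by page.

    Returns (entries, parents): a dict of rows keyed by entry id, and the
    set of ids that appear as a parent of some other entry.
    """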
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
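    """Fetches one page of the feed, preferring a local cached copy.

    The cached copy under scripts/feeds/ is reused unless it is missing
    or --force is given. Returns (doc, next_url); next_url is None on
    the last page.
    """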
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
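    """Returns the href of the first link with the given rel, if any."""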
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
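    """Flattens a raw GData feed entry into a simple row dict."""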
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
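    """Returns the trailing path component of a feed id URL."""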
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
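    """Converts an ISO-8601 UTC timestamp from the feed to a Unix time."""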
    # The feed timestamps are UTC (trailing 'Z'), so use timegm() rather
    # than mktime() to avoid applying the local timezone offset.
    return calendar.timegm(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        # On any unexpected failure, print the traceback and drop into
        # the debugger for a post-mortem.
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])