#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
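
Example invocation (the path here is just an example; see the argument
definitions in main() below):

    vpython3 export.py -j 8 /developers/design-documents

With no positional paths and no --path-list, every page and attachment
found in the feed is exported.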
33"""
34
35import argparse
Dirk Prankef9959472021-11-09 14:16:33 -080036import collections
Dirk Pranke7bbb5472021-11-02 16:33:21 -070037import io
38import json
39import os
40import pdb
41import sys
42import time
43import traceback
Dirk Prankef9959472021-11-09 14:16:33 -080044import xml.etree.ElementTree as ET
45
Dirk Pranke7bbb5472021-11-02 16:33:21 -070046from urllib.request import urlopen
47from urllib.error import HTTPError, URLError
48
49import yaml
50
51import common
52import html2markdown
53
54
55def main():
56 parser = argparse.ArgumentParser()
57 parser.add_argument('--force', action='store_true',
58 help='ignore updated timestamps in local cache')
Dirk Prankef9959472021-11-09 14:16:33 -080059 parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
Dirk Pranke7bbb5472021-11-02 16:33:21 -070060 parser.add_argument('-t', '--test', action='store_true')
61 parser.add_argument('-r', '--raw', action='store_true')
62 parser.add_argument('-v', '--verbose', action='count')
63 parser.add_argument('--max_results', type=int, default=5000)
64 parser.add_argument('--start-index', type=int, default=1)
65 parser.add_argument('--path-list')
66 parser.add_argument('path', nargs='*')
67 args = parser.parse_args()
68
Dirk Prankef9959472021-11-09 14:16:33 -080069 entries = _entries(args)
Dirk Pranke7bbb5472021-11-02 16:33:21 -070070
71 if args.path:
72 paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
73 path)
74 for path in args.path]
75 elif args.path_list:
76 paths_to_export = common.read_paths(args.path_list)
77 else:
78 paths_to_export = []
79
80 max_input_mtime = max(os.stat(__file__).st_mtime,
81 os.stat(common.__file__).st_mtime,
82 os.stat(html2markdown.__file__).st_mtime)
83
84 updated = 0
85 paths = []
86
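    # In --test mode, convert just the first requested path synchronously
    # and print the resulting Markdown to stdout for inspection.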
    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
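    # 'task' is the site-relative path ('/...') of the entry being handled;
    # 'obj' is the work tuple queued in main() (or built in the --test path).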
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

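    # Dump multi-line strings in YAML block style ('|') so the front matter
    # stays readable; single-line strings fall through to the default
    # representer saved in org_represent_str above.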
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
    if not entry['listitems']:
        return

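    # Render the list items as a simple HTML table. Rows sort by their
    # 'Release' column when present; rows without one sort first.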
    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write('  <tr>\n')
    for header in headers:
        content.write('    <th>%s</th>\n' % header)
    content.write('  </tr>\n')
    for row in rows:
        content.write('  <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"',
                                      '')
            content.write('    <td>%s</td>\n' % (value or ''))
        content.write('  </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

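        # Sites resource URLs look like /_/rsrc/<timestamp>/<path>; strip
        # the cache prefix and timestamp, keeping the page-relative path.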
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot')
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = {}
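    # Maps parent entry id -> set of child entry ids; used below to attach
    # 'listitem' children to their parent 'listpage'.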

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
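    # Cache each feed page under scripts/feeds/; since _needs_update() is
    # called with mtime 0, a cached page is only refetched with --force or
    # when the local copy is missing.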
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

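        # Each list-item cell comes back as a <field> element in the Google
        # Spreadsheets XML namespace; keep the column order so the values
        # line up with the table headers.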
        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)