#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
33"""

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

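    # Treat the exporter's own source files as inputs as well: pages are
    # rebuilt if this script or its helper modules changed after the
    # cached copy was written.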
    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

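    # Drain the queue; any failed task fails the whole run, and pages.json
    # is only rewritten when every task succeeded.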
    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret

137
Dirk Prankef9959472021-11-09 14:16:33 -0800138def _find_entry_by_path(path, entries):
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700139 for entry in entries.values():
Dirk Pranke304c5342021-11-03 12:34:21 -0700140 if entry['kind'] not in ('webpage', 'listpage',
141 'announcmentspage', 'filecabinet'):
Dirk Prankef9959472021-11-09 14:16:33 -0800142 continue
143 entry_path = _path(entry, entries)
Dirk Pranke7aa01372021-11-05 16:16:09 -0700144 if entry_path == path:
Dirk Prankef9959472021-11-09 14:16:33 -0800145 return entry
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700146 return None
147

def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

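    # Keep a reference to yaml's default string representer so that
    # repr_str() below can fall back to it for single-line strings.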
    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
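                # Scrub the backspace runs that html2markdown appears to use
                # as internal markers; they must not leak into the output.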
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
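    """Appends the rows of a listpage to `content` as a plain HTML table."""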
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(
                    ' xmlns="http://www.w3.org/1999/xhtml"', '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
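    """Rewrites hrefs from the exported pages into site-relative paths."""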
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

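        # Strip the '/_/rsrc/<revision>' prefix that Sites puts on
        # versioned static resources, keeping just the underlying path.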
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries):
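    """Returns an entry's site-absolute path by walking its parent chain."""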
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
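    """Builds the YAML front-matter dict (title, breadcrumbs) for a page."""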
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
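    """Returns True if `path` is missing, older than `mtime`, or forced."""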
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

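    # Attach each listpage's child listitems so that _handle_entry can
    # render the page and its rows together.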
    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
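    """Fetches one page of a feed, caching the JSON under scripts/feeds/."""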
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
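    """Flattens a GData feed entry into a plain dict of the fields we use."""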
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

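    # A listitem's field values live in a separate XML document; fetch it
    # (caching a copy under scripts/) and pull out its spreadsheet fields.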
    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
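    # time.mktime() interprets the parsed struct_time in local time rather
    # than UTC; the constant skew is close enough for the staleness checks
    # this script does.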
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
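    # On any crash, print the traceback and drop into pdb for a post-mortem.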
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)