#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane, modern Markdown and HTML as much as possible without significantly
changing the appearance of any page, with some minor exceptions.
"""

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--paths-to-skip')
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

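    # Decide which pages to export: explicit paths on the command line, a
    # file listing paths, or (by default) everything in the feed.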
    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    if args.paths_to_skip:
        paths_to_skip = set(common.read_paths(args.paths_to_skip))
    else:
        paths_to_skip = set(
            common.read_paths(os.path.join(common.REPO_DIR,
                                           'scripts', 'paths_to_skip.txt')))

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

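    # Test mode: convert just the first requested page, print the result,
    # and exit without processing the rest of the site.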
    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1
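
    # Everything else runs through a parallel job queue: each selected path
    # is handled by _handle_entry() in one of args.jobs workers.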
    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)

            if path in paths_to_skip:
                continue
            exported_pages.add(path)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/').rstrip('/')
            if path in paths_to_skip:
                continue
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

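    # Teach the YAML dumper to emit multiline strings in block style ('|')
    # so the front matter stays readable; other strings fall through to the
    # stock representer saved in org_represent_str.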
    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
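                # Post-process the converted Markdown: re-point googlesource
                # links from the retired 'master' ref to HEAD, and drop stray
                # backspace sequences.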
                content = content.replace(
                    'chromium.googlesource.com/chromium/src/+/master/',
                    'chromium.googlesource.com/chromium/src/+/HEAD/')
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
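        # Make the local path filesystem-friendly: ':' is not allowed in
        # filenames on some systems, and common URL escapes are decoded.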
        path = path.replace(':', '_')
        path = path.replace('%20', ' ')
        path = path.replace('%2B', '+')
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404s that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e

    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update
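

# Render a listpage's child listitems as a simple HTML table; _handle_entry()
# appends it after the converted Markdown body.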
def _write_listitems(content, entry):
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"',
                                      '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
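    """Rewrites hrefs from the exported Google Sites HTML into local paths.

    An instance is passed to html2markdown.Convert() in _handle_entry() so
    that links can be translated while the HTML is converted.
    """
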
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])

        url = urlparse(href)
        if '?' in href and url.netloc == '':
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot', 1)
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    path = ('/' + path).rstrip('/') or '/'
    return path
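

# Build the YAML front-matter dict for a page: its name, title, and a
# breadcrumb trail of [path, title] pairs leading back to the site root.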
def _metadata(entry, entries):
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True
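

# Fetch every entry in the site's content feed, following 'next' links until
# the feed is exhausted, and record parent -> children links so listitems can
# be attached to their listpages.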
def _entries(args):
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

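    # Attach each listpage's child listitems so _write_listitems() can render
    # them as a table.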
    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents.get(entry_id, set())
                                  if entries[child_id]['kind'] == 'listitem']

    return entries
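

# Fetch one page of the content feed as parsed JSON along with the URL of the
# next page, if any. Raw responses are cached under scripts/feeds/ so re-runs
# don't refetch them; --force bypasses the cache.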
def _fetch(url, force):
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

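    # A listitem's field values live in a separate per-item XML document
    # rather than in the JSON feed; fetch it (caching a copy under scripts/)
    # and pull out the field names and values.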
    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]
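

# Convert the feed's ISO 8601 timestamps into POSIX-style timestamps that
# can be compared against file mtimes.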
def _to_ts(iso_time):
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))
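

# On an unhandled exception, print the traceback and drop into pdb for a
# post-mortem.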
if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)