#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
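
Typical invocations (a sketch; the flags are defined in main() below):

    ./export.py                        # export the whole site
    ./export.py --force /developers    # re-export one page, ignoring
                                       # the local cache timestamps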
33"""
34
35import argparse
Dirk Prankef9959472021-11-09 14:16:33 -080036import collections
Dirk Pranke7bbb5472021-11-02 16:33:21 -070037import io
38import json
39import os
40import pdb
41import sys
42import time
43import traceback
Dirk Prankef9959472021-11-09 14:16:33 -080044import xml.etree.ElementTree as ET
45
Dirk Pranke0f82ab82021-11-16 18:43:10 -080046from urllib.parse import urlparse
Dirk Pranke7bbb5472021-11-02 16:33:21 -070047from urllib.request import urlopen
48from urllib.error import HTTPError, URLError
49
50import yaml
51
52import common
53import html2markdown
54
55
56def main():
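    """Exports the selected pages and attachments to common.SITE_DIR."""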
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
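    """Returns the page entry whose site path equals `path`, or None."""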
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
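    """Exports a single entry (a JobQueue handler).

    `task` is the site-relative path for the entry and `obj` is an
    (entry, metadata, max_input_mtime, force, raw) tuple. Returns an
    (err, did_update) pair, where `err` is an error string ('' on
    success) and `did_update` says whether a local file was written.
    """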
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
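    """Writes a listpage's listitems to `content` as an HTML table,
    sorted by their 'Release' field when present."""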
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write('  <tr>\n')
    for header in headers:
        content.write('    <th>%s</th>\n' % header)
    content.write('  </tr>\n')
    for row in rows:
        content.write('  <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"',
                                      '')
            content.write('    <td>%s</td>\n' % (value or ''))
        content.write('  </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
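        """Normalizes an href from the Sites export to a site-relative path.

        Strips alternate-domain prefixes and '/_/rsrc/<timestamp>' cache
        prefixes, drops query strings from relative links, and
        percent-encodes the ':' that appears in screenshot filenames.
        """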
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])

        url = urlparse(href)
        if '?' in href and url.netloc == '':
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot')
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
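    """Returns the site-relative path for `entry` by walking up its parents."""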
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
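    """Builds the YAML front-matter dict (title, breadcrumbs) for a page."""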
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700296 metadata = {}
297 metadata['page_name'] = entry['page_name']
298 metadata['title'] = entry['title']
299
300 crumbs = []
301 parent_id = entry.get('parent_id')
302 while parent_id:
303 parent = entries[parent_id]
Dirk Prankef9959472021-11-09 14:16:33 -0800304 path = _path(parent, entries)
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700305 title = parent['title']
306 crumbs = [[path, title]] + crumbs
307 parent_id = parent.get('parent_id')
308
309 metadata['breadcrumbs'] = crumbs
310
311 if metadata['page_name'] in (
312 'chromium-projects',
313 'chromium',
314 ):
315 metadata['use_title_as_h1'] = False
316
317 return metadata
318
319
320def _needs_update(path, mtime, force):
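    """Returns True if `path` is missing, older than `mtime`, or force is set."""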
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
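    """Fetches all of the site's content entries, keyed by entry id.

    Pages through the Sites content feed until it is exhausted, then
    attaches each listpage's child listitems to the listpage entry.
    """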
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
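    """Fetches a feed URL, using a local cache under scripts/feeds/.

    Returns (doc, next_url), where `doc` is the parsed JSON feed and
    `next_url` is the URL of the next page of results, if any.
    """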
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
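    """Returns the href of the link whose rel matches `rel`, or None."""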
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
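    """Flattens a raw feed entry into a simple dict of the fields we need.

    For listitems this also fetches (and caches) the per-item XML in
    order to extract the spreadsheet-style field values.
    """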
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
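    """Returns the entry id, i.e. the last path component of `url`."""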
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
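    """Converts the feed's ISO-format timestamp to a time.mktime() value."""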
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)