#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
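
Example usage (a sketch; the flags are defined in main() below, and the
file name export.py is assumed):

    # Export the entire site.
    $ vpython3 export.py

    # Convert a single page and print the result (test mode).
    $ vpython3 export.py -t /developers/design-documents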
33"""
34
import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    # Treat the exporter's own source files as inputs: if any of them is
    # newer than an output file, that output needs to be regenerated.
    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

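    # Fan the work out across a pool of workers; each task converts one
    # page or downloads one attachment.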
    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700118
Dirk Pranke304c5342021-11-03 12:34:21 -0700119 ret = 0
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700120 for path, res, did_update in q.results():
Dirk Pranke304c5342021-11-03 12:34:21 -0700121 if res:
122 ret = 1
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700123 if did_update:
124 updated += 1
125
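    # Only record the full set of exported page paths if every entry
    # exported cleanly.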
    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    # Save the stock string representer so repr_str() below can delegate
    # to it for single-line strings.
    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

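    # Emit multi-line strings in YAML literal block style ('|') so they
    # stay readable in the front matter; everything else falls back to
    # the default representer saved above.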
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                # Write the YAML front matter, then convert the page body
                # from HTML to Markdown.
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                # Strip backspace artifacts the converter can leave behind.
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


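# Rewrites hrefs that point back into the old Sites instance as
# site-relative paths.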
class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            # '/_/rsrc/<timestamp>/<path>' -> '/<path>'.
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


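# Builds the site-relative path for an entry by walking its parent links
# up to the root.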
def _path(entry, entries, parents):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that, so page through the feed 500 at a time.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
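
    # Follow the feed's 'next' links until the last page is reached.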
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
    # Cache each feed response under scripts/feeds/ so that later runs
    # can reuse it instead of refetching.
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


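# Flattens a raw GData feed entry into the simple dict the exporter uses.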
def _to_row(entry, rownum):
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    # Timestamps look like '2021-11-02T16:33:21.123Z'; the fractional
    # seconds are parsed but discarded by time.strptime().
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Print the traceback, then drop into the debugger at the point
        # of failure.
        tb = sys.exc_info()[2]
        traceback.print_exc()
        pdb.post_mortem(tb)