#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
33"""
34
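# Example invocations (a sketch: the script filename is an assumption,
# but the flags are the ones defined in main() below):
#
#   ./export.py                       # export the whole site
#   ./export.py --force /developers   # re-export one path unconditionally
#   ./export.py -t /developers        # test mode: print one page's Markdown
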
import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        # Make sure every path has a leading slash.
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    # If any of these scripts have changed, every page needs regenerating.
    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        # Test mode: export a single page, print the result, and exit.
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    # Export everything else in parallel.
    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700126
127
128def _find_entry_by_path(path, entries, parents):
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700129 for entry in entries.values():
Dirk Pranke304c5342021-11-03 12:34:21 -0700130 if entry['kind'] not in ('webpage', 'listpage',
131 'announcmentspage', 'filecabinet'):
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700132 continue
133 entry_path = _path(entry, entries, parents)
Dirk Pranke7aa01372021-11-05 16:16:09 -0700134 if entry_path == path:
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700135 return entry
136 return None
137
138
def _handle_entry(task, obj):
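    """Export a single entry to the local tree.

    `task` is the site-relative path for the entry, and `obj` is an
    (entry, metadata, max_input_mtime, force, raw) tuple. Returns an
    (err, did_update) pair, where `err` is an error string ('' on
    success) and `did_update` says whether a file was written.
    """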
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
            '/developers/jinja',
            '/developers/polymer-1-0',
            '/devtools/breakpoints-tutorial/index.html',
            '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        # Dump multi-line strings in YAML block (|) style.
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                # Convert the HTML to Markdown with YAML front matter.
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
                '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
                '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
    def Translate(self, href):
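        """Convert an href from Sites into a local, site-relative URL.

        Strips any prefix listed in common.alternates, as well as the
        Google Sites '/_/rsrc/<timestamp>' resource-cache prefix and
        any query string.
        """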
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            # Drop the leading '/_/rsrc/<timestamp>' components.
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
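    """Return the site-relative path for an entry, e.g. '/developers/foo'.

    The path is built by walking the parent_id chain up to the root.
    (`parents` is accepted for symmetry with the other helpers but is
    not used here.)
    """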
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
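    """Build the YAML front-matter dict (title, breadcrumbs, etc.) for a page."""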
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
            'chromium-projects',
            'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
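    """Fetch all entries from the Sites content feed.

    Returns (entries, parents): a dict mapping entry id to row dict,
    and the set of ids that are parents of at least one other entry.
    """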
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that, so page through the feed.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
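    """Fetch a feed URL as JSON, caching the response under scripts/feeds/."""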
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
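    """Flatten a raw GData feed entry into a simple row dict."""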
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = row['alt_url']
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    # The id is the last path component of the entry's id URL.
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    # Note: time.mktime() interprets the parsed struct_time in local time.
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)