#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""Export www.chromium.org to local files.
17
18This script uses the Google GData and Google Sites APIs to extract the
19content from http://www.chromium.org/ and write it into local files
20that can be used to serve the same content.
21
22The APIs are documented at
23
24https://developers.google.com/sites/docs/1.0/developers_guide_protocol
25https://developers.google.com/gdata/docs/json
26
27Because www.chromium.org is a public site, this script requires no
28authentication to work.
29
30The exporting process attempts to convert the original content into
31sane modern HTML as much as possible without changing the appearance
32of any page significantly, with some minor exceptions.
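
Example usage (flags as defined in main() below; the path may be any
page path on the site): the following re-exports a single page and
prints the converted Markdown to stdout:

    $ ./export.py -t /developers/design-documents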
33"""
34
35import argparse
36import io
37import json
38import os
39import pdb
40import sys
41import time
42import traceback
43from urllib.request import urlopen
44from urllib.error import HTTPError, URLError
45
46import yaml
47
48import common
49import html2markdown
50
51
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

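    # Emit multi-line strings in YAML block style ('|') so the generated
    # front matter stays readable.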
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
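        # NOTE: the 'True or' below disables the freshness check, so page
        # content is always regenerated; attachments further down still
        # honor _needs_update().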
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
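                # Scrub the space-plus-backspace sequences that the
                # converter can leave behind.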
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

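        # Sites serves cached copies of resources from URLs of the form
        # /_/rsrc/<timestamp>/<path>; keep only the trailing <path>.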
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
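    # The returned dict becomes the YAML front matter of the exported
    # page. Illustrative (hypothetical) values:
    #   page_name: objects
    #   title: Objects
    #   breadcrumbs:
    #   - [/developers, For Developers]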
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
    if args.verbose:
        print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    return entries, parents


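# Feed pages are cached under scripts/feeds/ so that repeated runs do not
# have to refetch them; pass --force to refetch.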
def _fetch(url, force):
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
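    # A GData feed entry, as consumed below, looks roughly like:
    #   {'id': {'$t': 'https://.../<id>'},
    #    'category': [{'label': 'webpage'}],
    #    'title': {'$t': '...'},
    #    'content': {'$t': '<html>...'},
    #    'published': {'$t': '...'}, 'updated': {'$t': '...'},
    #    'sites$pageName': {'$t': '...'},
    #    'link': [{'rel': 'alternate', 'href': '...'}, ...]}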
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
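    # Note: the feed timestamps are UTC ('Z'), but mktime() interprets
    # the parsed struct_time as local time. The skew is consistent from
    # run to run, which is all the freshness checks above need.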
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])