#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
"""

import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
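    """Export the requested paths (or, if none are given, the whole site)."""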
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
                                                          path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage', 'announcementspage',
                             'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '')
        else:
            continue

        if not paths_to_export or (
                ('/' + path).replace('/index', '') in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    for path, res, did_update in q.results():
        if did_update:
            updated += 1

    print('updated %d entries' % updated)


def _find_entry_by_path(path, entries, parents):
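    """Return the page entry whose exported path matches `path`, or None."""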
    seen = set()
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage',
                                 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        seen.add(entry_path)
        if '/' + entry_path in (path, path + '/index'):
            return entry
    return None


def _handle_entry(task, obj):
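    """Export a single feed entry to a local file.

    `task` is the destination path relative to common.SOURCE_DIR, and `obj`
    is an (entry, metadata, max_input_mtime, force, raw) tuple. Pages are
    converted to Markdown with YAML front matter; attachments are downloaded
    as-is. Returns an (err, did_update) tuple.
    """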
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
            'developers/jinja',
            'developers/polymer-1-0',
            'devtools/breakpoints-tutorial/index.html',
            'devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s/%s.md' % (common.SOURCE_DIR, task)
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content.encode('utf-8'))
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s/%s' % (common.SOURCE_DIR, task)
        if path in (
                'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
                'site/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
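    """Rewrites hrefs from the original Sites pages into site-relative paths."""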
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
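    """Return the output path for an entry.

    The path is built from the page names of the entry's ancestors plus its
    own; entries that have children get an '/index' suffix.
    """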
    path = entry['page_name']
    if entry['id'] in parents:
        path = path + '/index'
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return path


def _metadata(entry, entries, parents):
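    """Build the front-matter dict for a page: name, title, and breadcrumbs."""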
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = '/' + _path(parent, entries, parents).replace('/index', '')
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
            'chromium-projects',
            'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
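    """Return True if path is missing or older than mtime, or force is set."""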
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
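    """Fetch the full content feed.

    Returns a dict mapping entry id to row dicts and the set of ids that
    are parents of other entries.
    """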
    entries = {}
    parents = set()

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
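    """Fetch one page of the feed as JSON, caching it under scripts/feeds/.

    The cached copy is reused whenever it exists, unless --force is given.
    Returns (doc, next_url), where next_url is the next page's URL or None.
    """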
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
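    """Flatten a raw GData feed entry into the dict of fields we need."""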
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        main()
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)