#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane, modern Markdown (with YAML front matter) as much as possible,
without significantly changing the appearance of any page, with some
minor exceptions.
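
Typical invocations (the paths and file names below are illustrative
examples only):

    $ vpython3 export.py /developers/design-documents
    $ vpython3 export.py --path-list paths-to-export.txt --force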
"""

import argparse
import calendar
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries, parents = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

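    # In test mode, export just the first requested path and print the
    # result to stdout instead of writing out the whole site.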
    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries, parents)
        if entry:
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

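    # Export everything in parallel, one queued job per page or attachment.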
    q = common.JobQueue(_handle_entry, common.cpu_count())

    paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries, parents)
            path = _path(entry, entries, parents)
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries, parents):
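    """Returns the page-like entry whose derived path matches, or None."""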
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries, parents)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
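    """Worker that exports a single page or attachment.

    `task` is the site-rooted path and `obj` is the (entry, metadata,
    max_input_mtime, force, raw) tuple passed to JobQueue.request().
    Returns an (err, did_update) pair.
    """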
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
            '/developers/jinja',
            '/developers/polymer-1-0',
            '/devtools/breakpoints-tutorial/index.html',
            '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

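    # Emit multi-line strings in YAML block style ('|') so the generated
    # front matter stays readable.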
    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        target_mtime = max(mtime, max_input_mtime)
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
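        # Note: the 'True or' below bypasses the freshness check, so page
        # content is always regenerated regardless of timestamps.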
        if True or _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')
            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] in ('announcement', 'listitem'):
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
                '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
                '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # Ignore comments in the migration.
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


class _URLConverter:
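    """Rewrites hrefs from the original Sites pages into local paths."""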
    def Translate(self, href):
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

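        # Strip the '/_/rsrc/<N>' prefix (N is typically a revision
        # timestamp) that Sites inserts in front of resource URLs.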
        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        return href


def _path(entry, entries, parents):
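    """Returns the site-rooted path to an entry, e.g. '/parent/page_name'."""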
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries, parents):
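    """Builds the front-matter dict (page name, title, breadcrumbs)."""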
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries, parents)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
            'chromium-projects',
            'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
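    """Returns True if path is missing or older than mtime (or if forced)."""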
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
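    """Fetches the full content feed, following 'next' links page by page.

    Returns (entries, parents): a dict of rows keyed by entry id, and the
    set of ids that appear as a parent of some other entry.
    """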
    entries = {}
    parents = set()

    # Sites appears to cap results at 500 entries per request, even if
    # we ask for more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.add(row['parent_id'])
        if args.verbose:
            print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.add(row['parent_id'])
            if args.verbose:
                print(' ... [%d]' % rownum)

    return entries, parents


def _fetch(url, force):
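    """Fetches one page of the feed, preferring a local cached copy.

    The cached copy under scripts/feeds/ is reused unless it is missing
    or --force is given. Returns (doc, next_url); next_url is None on
    the last page.
    """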
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
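    """Returns the href of the first link with the given rel, if any."""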
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
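    """Flattens a raw GData feed entry into a simple row dict."""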
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
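    """Returns the trailing path component of a feed id URL."""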
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
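    """Converts an ISO-8601 UTC timestamp from the feed to a Unix time."""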
    # The feed timestamps are UTC (trailing 'Z'), so use timegm() rather
    # than mktime() to avoid applying the local timezone offset.
    return calendar.timegm(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        # On any unexpected failure, print the traceback and drop into
        # the debugger for a post-mortem.
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])