#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

  https://developers.google.com/sites/docs/1.0/developers_guide_protocol
  https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without changing the appearance
of any page significantly, with some minor exceptions.
"""

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
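    """Parse arguments, export the requested pages, and report results.

    Returns 0 on success and 1 if any entry failed to export.
    """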
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                     path)
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0
    paths = []

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for i, entry in enumerate(list(entries.values())[:args.max_results]):
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
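    """Return the page entry whose computed site path matches `path`.

    Only page-like kinds are considered; returns None if nothing matches.
    """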
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
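    """Export a single entry to the local site tree.

    `task` is the site-relative path being exported and `obj` is the
    (entry, metadata, max_input_mtime, force, raw) tuple queued by main();
    this is used both directly (--test) and as the JobQueue handler.
    Returns (err, did_update), where `err` is '' on success.
    """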
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # ignore comments in the migration
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
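    """Render a listpage's listitems as an HTML table, sorted by 'Release'.

    `content` is the writable stream for the page body; values that carry
    an inline XHTML namespace have it stripped first.
    """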
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
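        """Convert a raw Sites href into a site-relative path.

        Strips known alternate URL prefixes and /_/rsrc cache prefixes,
        drops query strings from relative links, and percent-encodes the
        ':' in screenshot filenames.
        """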
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])

        url = urlparse(href)
        if '?' in href and url.netloc == '':
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot', 1)
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
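    """Return the site-relative path of a page by walking its parent links."""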
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
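    """Build the YAML front matter (title, breadcrumbs, etc.) for a page."""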
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
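    """Return True if the file at `path` is missing or older than `mtime`."""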
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
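    """Fetch all content entries from the Sites feed, keyed by entry id.

    Follows the feed's 'next' links to page through results, and attaches
    each listpage's listitem rows to it as 'listitems'.
    """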
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)

    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents.get(entry_id, set())
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
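    """Fetch a feed page as JSON, using a local cache under scripts/feeds/.

    Returns (doc, next_url), where next_url is the feed's 'next' link, if
    any.
    """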
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
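    """Return the href of the link with the given rel, or None."""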
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
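    """Flatten a raw GData feed entry into a simple dict.

    For listitem entries, also fetches the item's XML (cached locally) and
    extracts its spreadsheet-style fields into an OrderedDict.
    """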
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
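    """Return the final path segment of `url`, used as an entry id."""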
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
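    """Convert an ISO-8601 timestamp from the feed to a Unix timestamp."""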
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)