#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

https://developers.google.com/sites/docs/1.0/developers_guide_protocol
https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The exporting process attempts to convert the original content into
sane modern HTML as much as possible without significantly changing
the appearance of any page, with some minor exceptions.
"""
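
# Example invocations (the script name and page paths are illustrative):
#
#   vpython3 export.py /developers/design-documents
#   vpython3 export.py --path-list scripts/paths.txt --jobs 8 --force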

import argparse
import collections
import io
import json
import os
import pdb
import sys
import time
import traceback
import xml.etree.ElementTree as ET

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
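    """Exports the requested pages and prints how many entries were updated."""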
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true',
                        help='ignore updated timestamps in local cache')
    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-r', '--raw', action='store_true')
    parser.add_argument('-v', '--verbose', action='count')
    parser.add_argument('--max_results', type=int, default=5000)
    parser.add_argument('--start-index', type=int, default=1)
    parser.add_argument('--path-list')
    parser.add_argument('path', nargs='*')
    args = parser.parse_args()

    entries = _entries(args)

    if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
    elif args.path_list:
        paths_to_export = common.read_paths(args.path_list)
    else:
        paths_to_export = []

    max_input_mtime = max(os.stat(__file__).st_mtime,
                          os.stat(common.__file__).st_mtime,
                          os.stat(html2markdown.__file__).st_mtime)

    updated = 0

    if args.test:
        entry = _find_entry_by_path(paths_to_export[0], entries)
        if entry:
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            _ = _handle_entry(path,
                              (entry, metadata, max_input_mtime, args.force,
                               args.raw))
            content = common.read_text_file('%s%s/index.md' %
                                            (common.SITE_DIR, path))
            print(content)
            return 0
        else:
            print('%s not found' % paths_to_export[0])
            return 1

    q = common.JobQueue(_handle_entry, args.jobs)

    paths_to_export = set(paths_to_export)
    exported_pages = set()
    for entry in list(entries.values())[:args.max_results]:
        if entry['kind'] in ('webpage', 'listpage',
                             'announcementspage', 'filecabinet'):
            metadata = _metadata(entry, entries)
            path = _path(entry, entries)
            exported_pages.add(path.rstrip('/') or '/')
        elif entry['kind'] == 'attachment':
            metadata = {}
            path = entry['url'].replace(
                'https://sites.google.com/a/chromium.org/dev/', '/')
        else:
            continue
        if not paths_to_export or (path in paths_to_export):
            q.request(path, (entry, metadata, max_input_mtime, args.force,
                             False))

    ret = 0
    for path, res, did_update in q.results():
        if res:
            ret = 1
        if did_update:
            updated += 1

    if ret == 0:
        common.write_text_file(
            os.path.join(common.SITE_DIR, 'pages.json'),
            json.dumps(sorted(exported_pages), indent=2) + '\n')
    print('updated %d entries' % updated)
    return ret


def _find_entry_by_path(path, entries):
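    """Returns the page entry whose exported path matches `path`, if any."""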
    for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage',
                                 'announcementspage', 'filecabinet'):
            continue
        entry_path = _path(entry, entries)
        if entry_path == path:
            return entry
    return None


def _handle_entry(task, obj):
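    """Exports a single entry; returns an (err, did_update) pair.

    `task` is the exported site path for the entry; `obj` is the
    (entry, metadata, max_input_mtime, force, raw) tuple queued in main().
    """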
    entry, metadata, max_input_mtime, force, raw = obj
    err = ''
    did_update = False

    if not task.startswith('/'):
        return 'malformed task', False

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    if task in (
        '/developers/jinja',
        '/developers/polymer-1-0',
        '/devtools/breakpoints-tutorial/index.html',
        '/devtools/breakpoints-tutorial/script.js',
    ):
        # TODO: Eleventy chokes on these files.
        return '', False

    def repr_str(dumper, data):
        if '\n' in data:
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                           style='|')
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

    mtime = _to_ts(entry['updated'])
    target_mtime = max(mtime, max_input_mtime)
    if entry['kind'] in ('webpage',
                         'listpage',
                         'announcementspage',
                         'filecabinet'):
        path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
        if _needs_update(path, target_mtime, force):
            if raw:
                content = entry['content']
            else:
                content_sio = io.StringIO(entry['content'])
                md_sio = io.StringIO()
                md_sio.write('---\n')
                md_sio.write(yaml.safe_dump(metadata))
                md_sio.write('---\n\n')
                url_converter = _URLConverter()
                html2markdown.Convert(content_sio, md_sio, url_converter)
                if entry['kind'] == 'listpage':
                    md_sio.write('\n\n')
                    _write_listitems(md_sio, entry)
                content = md_sio.getvalue()
                content = content.replace(' \b\b\b\b', '')

            did_update = common.write_if_changed(path, content, mode='w')
        else:
            did_update = False
    elif entry['kind'] == 'listitem':
        # Handled as part of the corresponding 'listpage' entry.
        pass
    elif entry['kind'] == 'announcement':
        # TODO: implement me.
        pass
    elif entry['kind'] == 'attachment':
        if ':' in task:
            task = _URLConverter().Translate(task)
        path = '%s%s' % (common.SITE_DIR, task)
        if task in (
            '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
            '/developers/design-documents/cookie-split-loading/objects.png',
        ):
            # These are expected 404's that we ignore.
            did_update = False
        elif _needs_update(path, mtime, force):
            try:
                fp = urlopen(entry['url'])
                content = fp.read()
                did_update = common.write_if_changed(path, content)
            except (HTTPError, URLError, TimeoutError) as e:
                err = 'Error: %s' % e
    elif entry['kind'] == 'comment':
        # ignore comments in the migration
        pass
    elif entry['kind'] == 'tag':
        err = 'tag kind not implemented'
    else:
        err = 'unknown kind %s' % entry['kind']

    return err, did_update


def _write_listitems(content, entry):
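    """Appends a listpage's listitems to `content` as an HTML table."""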
    if not entry['listitems']:
        return

    headers = entry['listitems'][0].keys()
    rows = sorted(entry['listitems'],
                  key=lambda row: row.get('Release') or '')

    content.write('<table>\n')
    content.write(' <tr>\n')
    for header in headers:
        content.write(' <th>%s</th>\n' % header)
    content.write(' </tr>\n')
    for row in rows:
        content.write(' <tr>\n')
        for value in row.values():
            if value and value.startswith('<a xmlns='):
                value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"',
                                      '')
            content.write(' <td>%s</td>\n' % (value or ''))
        content.write(' </tr>\n')
    content.write('</table>\n')


class _URLConverter:
    def Translate(self, href):
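        """Rewrites a sites.google.com href into its exported local form."""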
        if not href:
            return ''

        for path in common.alternates:
            if href.startswith(path):
                href = href.replace(path, '')

        if href.startswith('/_/rsrc'):
            href = '/' + '/'.join(href.split('/')[4:])
        if '?' in href:
            href = href[0:href.index('?')]
        if 'Screenshot' in href:
            head, tail = href.split('Screenshot', 1)
            tail = tail.replace(':', '%3A')
            href = head + 'Screenshot' + tail
        return href


def _path(entry, entries):
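    """Returns the site-absolute path for an entry, including its ancestors."""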
    path = entry['page_name']
    parent_id = entry.get('parent_id')
    while parent_id:
        path = entries[parent_id]['page_name'] + '/' + path
        parent_id = entries[parent_id].get('parent_id')

    return '/' + path


def _metadata(entry, entries):
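    """Builds the front-matter dict (page name, title, breadcrumbs)."""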
    metadata = {}
    metadata['page_name'] = entry['page_name']
    metadata['title'] = entry['title']

    crumbs = []
    parent_id = entry.get('parent_id')
    while parent_id:
        parent = entries[parent_id]
        path = _path(parent, entries)
        title = parent['title']
        crumbs = [[path, title]] + crumbs
        parent_id = parent.get('parent_id')

    metadata['breadcrumbs'] = crumbs

    if metadata['page_name'] in (
        'chromium-projects',
        'chromium',
    ):
        metadata['use_title_as_h1'] = False

    return metadata


def _needs_update(path, mtime, force):
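    """Returns True if `path` is missing or older than `mtime` (or if forced)."""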
    if force:
        return True
    if os.path.exists(path):
        st = os.stat(path)
        return mtime > st.st_mtime
    return True


def _entries(args):
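    """Fetches every feed entry, keyed by id, with listitems attached."""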
    entries = {}
    parents = {}

    # Looks like Sites probably caps results at 500 entries per request,
    # even if we request more than that.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=%d&alt=json' %
           (args.start_index, 500 - rownum))
    doc, next_url = _fetch(url, args.force)

    for rownum, entry in enumerate(doc['feed']['entry'], start=1):
        row = _to_row(entry, rownum)
        entries[row['id']] = row
        if row.get('parent_id'):
            parents.setdefault(row['parent_id'], set()).add(row['id'])
    if args.verbose:
        print(' ... [%d]' % rownum)
    while next_url:
        doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
            row = _to_row(entry, rownum)
            entries[row['id']] = row
            if row.get('parent_id'):
                parents.setdefault(row['parent_id'], set()).add(row['id'])
        if args.verbose:
            print(' ... [%d]' % rownum)

    for entry_id, entry in entries.items():
        if entry['kind'] == 'listpage':
            entry['listitems'] = [entries[child_id]['fields'] for child_id
                                  in parents[entry_id]
                                  if entries[child_id]['kind'] == 'listitem']

    return entries


def _fetch(url, force):
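    """Fetches `url` as JSON, via a local cache; returns (doc, next_url)."""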
    path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
    if _needs_update(path, 0, force):
        fp = urlopen(url)
        content = fp.read()
        doc = json.loads(content)
        updated = _to_ts(doc['feed']['updated']['$t'])
        common.write_if_changed(path, content)
    else:
        with open(path) as fp:
            doc = json.load(fp)
    next_url = _find_link(doc['feed'], 'next')
    return doc, next_url


def _find_link(doc, rel):
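    """Returns the href of the first link whose rel matches `rel`, or None."""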
    for ent in doc['link']:
        if ent['rel'] == rel:
            return ent['href']
    return None


def _to_row(entry, rownum):
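    """Flattens a raw feed entry into a row dict keyed by simple names."""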
    row = {
        'rownum': rownum,
        'content': entry.get('content', {}).get('$t'),
        'id': _to_id(entry['id']['$t']),
        'kind': entry['category'][0]['label'],
        'published': entry['published']['$t'],
        'updated': entry['updated']['$t'],
    }

    row['page_name'] = entry.get('sites$pageName', {}).get('$t')
    row['title'] = entry.get('title', {}).get('$t')
    row['alt_url'] = _find_link(entry, 'alternate')

    if row['kind'] == 'attachment':
        row['url'] = _find_link(entry, 'alternate')
    else:
        row['url'] = _find_link(entry, 'self')

    if row['kind'] == 'listitem':
        path = row['url'].replace('https://sites.google.com',
                                  os.path.join(common.REPO_DIR, 'scripts'))
        if os.path.exists(path):
            xml_content = common.read_text_file(path)
        else:
            print('fetching %s' % row['url'])
            with urlopen(row['url']) as fp:
                xml_content = fp.read()
            common.write_if_changed(path, xml_content)

        root = ET.fromstring(xml_content)
        fields = root.findall(
            '{http://schemas.google.com/spreadsheets/2006}field')
        row['fields'] = collections.OrderedDict(
            (el.attrib['name'], el.text) for el in fields)

    parent_url = _find_link(entry,
                            'http://schemas.google.com/sites/2008#parent')
    if parent_url:
        row['parent_id'] = _to_id(parent_url)
    return row


def _to_id(url):
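    """Returns the trailing path component of an entry or parent URL."""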
    return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
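    """Converts an ISO 8601 timestamp string into seconds for comparison."""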
    return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
    try:
        sys.exit(main())
    except Exception:
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)