Scripts for crawling the website.
This lands a script that acts as a simple web crawler.
It'll crawl http://www.chromium.org and recursively fetch
all of the content on the site, rewriting any URLs to point
to the local copies of the pages.
The script uses a number of hard-coded values:
- the list of URLs to use as starting points (in paths_to_crawl.txt)
- a list of URLs to skip (because they don't exist)
- alternate ways of writing equivalent URLs (e.g. /foo and
/a/chromium.org/dev/foo).
and so it is not a general-purpose crawler, but it would not be
too hard to generalize it a bit more to become one.
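To make the "alternate URLs" item concrete, here is a rough sketch of
what the alternates table is assumed to look like (the actual values
live in the shared common module, not shown in this diff) and how the
equivalent spellings collapse to a single local path:

    # hypothetical entries; the real list is common.alternates
    alternates = [
        'https://www.chromium.org',
        'https://sites.google.com/a/chromium.org/dev',
    ]
    # both of the following then normalize to the local path '/foo':
    #   https://www.chromium.org/foo
    #   https://sites.google.com/a/chromium.org/dev/foo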
It also supports a `--prefix` option to rewrite all of the URLs
so that they start at an offset from the root directory of the
web server, e.g., instead of fetching /Home from ./Home, it'll fetch
it from ./$prefix/Home. This is useful if you want to embed the
crawled content as part of a larger site.
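For example (a sketch of the intended behavior, using a hypothetical
prefix value of "original" and the default --out-dir of "pages"):

    ./scripts/crawl.py
        # /Home -> pages/Home/index.html, links keep pointing at /Home
    ./scripts/crawl.py --prefix original
        # /Home -> pages/original/Home/index.html, links are rewritten
        # to /original/Home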
In the context of the chromium.org migration, this crawler and the
prefix option will allow us to nest the original content inside
the new content, so that the two can easily be compared.
Bug: 1267643
Change-Id: I9b917f4a1dba795bcda8286611e1d631d7d27518
Reviewed-on: https://chromium-review.googlesource.com/c/website/+/3266881
Commit-Queue: Dirk Pranke <dpranke@google.com>
Auto-Submit: Dirk Pranke <dpranke@google.com>
Reviewed-by: Struan Shrimpton <sshrimp@google.com>
diff --git a/scripts/crawl.py b/scripts/crawl.py
new file mode 100755
index 0000000..f565fd8
--- /dev/null
+++ b/scripts/crawl.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# Copyright 2021 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Script for crawling a website.
+
+This script will crawl a website and download all of the pages and
+related assets, rewriting the links to point to the local copies.
+"""
+
+import argparse
+import html
+import os
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+import common
+
+
+def main(argv):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-j', '--jobs', type=int,
+ default=common.cpu_count(),
+ help='Number of URLs to fetch in parallel '
+ '(default %(default)s)')
+ parser.add_argument('-l', '--limit', type=int, metavar='N',
+ help='Only fetch the first N pages')
+ parser.add_argument('-o', '--out-dir', default='pages')
+ parser.add_argument('--path-list', default='scripts/paths_to_crawl.txt',
+ help='initial list of URLs to seed the crawl')
+ parser.add_argument('--paths-to-skip', default='scripts/paths_to_skip.txt',
+ help='list of URLs to skip (expected 404s)')
+ parser.add_argument('--prefix', default='',
+ help='path to prepend to all the URLs')
+ parser.add_argument('paths', nargs='*')
+ args = parser.parse_args(argv)
+
+ if args.paths:
+ urls = [common.site + path for path in args.paths]
+ elif args.path_list:
+ urls = [common.site + path for path in
+ common.read_paths(args.path_list)]
+ else:
+ urls = [common.site + '/']
+
+ args.alternates = common.alternates
+
+ Crawler(args).crawl(urls)
+
+
+class Crawler:
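+    """Simple crawler that fetches pages in parallel via a common.JobQueue.
+
+    Each queued task is a normalized site-relative path; the worker fetches
+    the corresponding URL, rewrites its contents, writes the result under
+    --out-dir, and returns any newly discovered links to be queued next.
+    """
+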
+ def __init__(self, args):
+ self.site = None
+ self.host = None
+ self.args = args
+ self.queue = common.JobQueue(self._handle_url, self.args.jobs,
+ multiprocess=False)
+ self.paths_to_skip = set()
+
+ def crawl(self, urls):
+ if self.args.paths_to_skip:
+ self.paths_to_skip = common.read_paths(self.args.paths_to_skip)
+
+ self.site = urllib.parse.urlparse(urls[0])
+ self.host = urllib.parse.urlunparse(
+ urllib.parse.ParseResult(self.site.scheme,
+ self.site.netloc,
+ path='', params='', query='', fragment=''))
+
+ self._fetch(urls)
+ num_errors = 0
+ num_urls = 0
+ for task, res, links in self.queue.results():
+ if res:
+ num_errors += 1
+ num_urls += 1
+ self._fetch(urllib.parse.urljoin(self.host + task, link)
+ for link in links)
+
+ print('Crawled %d URLs%s.' % (num_urls,
+ (' (%d errors)' % num_errors) if num_errors else ''))
+
+ return 0
+
+ def _fetch(self, urls):
+ for url in urls:
+ should_fetch, task, new_url = self._filter(url)
+ if should_fetch:
+ self.queue.request(task, new_url)
+
+ def _filter(self, url):
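+        # Returns (should_fetch, task, new_url): `task` is the normalized
+        # site-relative path that serves as the de-duplication key in the
+        # job queue, and `new_url` is the cleaned-up URL to actually fetch.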
+ comps = urllib.parse.urlparse(url)
+ if (not any(url.startswith(x) for x in common.alternates) or
+ comps.path.startswith('/system/app/pages')):
+ return False, comps.path, url
+
+ task = _rewrite_link(url, '')
+ idx = task.find('#')
+ if idx != -1:
+ task = task[:idx]
+ task = task.rstrip('/') or '/'
+ if task in self.paths_to_skip:
+ return False, task, url
+ if task in self.queue.all_tasks():
+ return False, task, url
+
+ new_url_comps = urllib.parse.ParseResult(
+ comps.scheme,
+ comps.netloc,
+ comps.path.rstrip('/') or '/',
+ params='',
+ query='',
+ fragment='')
+ new_url = urllib.parse.urlunparse(new_url_comps)
+
+ all_tasks = self.queue.all_tasks()
+ if not self.args.limit or len(all_tasks) < self.args.limit:
+ if task not in all_tasks:
+ return True, task, new_url
+ return False, task, new_url
+
+ def _handle_url(self, task, url):
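+        # Job queue worker: fetch `url` (retrying a few times on errors),
+        # rewrite HTML/CSS so links point at the local copies, write the
+        # result under --out-dir, and return (error_message, links_found).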
+ del task
+ prefix = self.args.prefix
+ out_dir = self.args.out_dir
+ comps = urllib.parse.urlparse(url)
+ path = _rewrite_link(url, prefix)
+
+ res = ''
+ links = []
+
+ for i in range(4):
+ try:
+ with urllib.request.urlopen(url) as resp:
+ content_type = resp.getheader('Content-Type')
+                    content = resp.read()
+                break  # success; stop retrying
+ except Exception as e:
+ if i < 3:
+ time.sleep(1.0)
+ continue
+
+ res = '%s: %s' % (type(e), str(e))
+ return res, links
+
+ if content_type.startswith('text/html'):
+ page, links = _rewrite_html(content.decode('utf-8'), prefix)
+ new_content = page.encode('utf-8')
+ path = path.rstrip('/') + '/index.html'
+ elif content_type == 'text/css':
+ page, links = _rewrite_css(content.decode('utf-8'), prefix)
+ new_content = page.encode('utf-8')
+ else:
+ new_content = content
+ common.write_if_changed(out_dir + path, new_content)
+ return res, links
+
+
+def _rewrite_html(page, prefix):
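+    # Rewrites the link-bearing attributes of the page and returns
+    # (new_page, links), where `links` holds the URLs that were found.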
+ links = set()
+ page = _rewrite_tag(page, prefix, links, 'a', 'href')
+ page = _rewrite_tag(page, prefix, links, 'img', 'src')
+ page = _rewrite_tag(page, prefix, links, 'script', 'src')
+ for val in ('stylesheet', 'shortcut icon', 'apple-touch-icon'):
+ page = _rewrite_tag(page, prefix, links, 'link', 'href', val)
+
+ return page, links
+
+
+def _rewrite_tag(page, prefix, links, tag, attr, rel=None, follow=True):
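+    # Scans `page` for `tag` elements, rewrites the value of `attr` with
+    # _rewrite_link(), and records the original value in `links` (unless
+    # follow=False is passed for <a> tags).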
+ new_page = ''
+ if rel:
+ tag_str = '<%s rel="%s"' % (tag, rel)
+ else:
+ tag_str = '<%s' % (tag,)
+ attr_str = '%s="' % (attr,)
+
+ pos = 0
+ while True:
+ idx = page.find(tag_str, pos)
+ if idx == -1:
+ new_page += page[pos:]
+ break
+
+ tag_close_idx = page.find('>', idx)
+ attr_idx = page.find(attr_str, idx)
+ if attr_idx == -1 or attr_idx > tag_close_idx:
+ new_page += page[pos:tag_close_idx]
+ pos = tag_close_idx
+ continue
+
+ link_start = attr_idx + len(attr_str)
+ link_end = page.find('"', link_start)
+
+ link = html.unescape(page[link_start:link_end])
+ new_link = _rewrite_link(link, prefix)
+
+ if follow or tag != 'a':
+ links.add(link)
+
+ new_page += page[pos:link_start]
+ new_page += new_link
+ pos = link_end
+
+ return new_page
+
+
+def _rewrite_css(content, prefix):
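+    # Rewrites @import "..." and url(...) references in a stylesheet.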
+ content, links = _rewrite_rule(content, prefix, '@import "', '"')
+ content, more_links = _rewrite_rule(content, prefix, 'url(', ')')
+ links.update(more_links)
+
+ return content, links
+
+
+def _rewrite_rule(content, prefix, start, end):
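+    # Rewrites every reference delimited by `start` and `end` (optionally
+    # quoted) and returns (new_content, links) with the URLs that were seen.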
+ new_content = ''
+ links = set()
+
+ pos = 0
+ while True:
+ start_idx = content.find(start, pos)
+ if start_idx == -1:
+ new_content += content[pos:]
+ break
+
+ end_idx = content.find(end, start_idx)
+
+ link_start = start_idx + len(start)
+
+ if ((content[link_start] == '"' and content[end_idx-1] == '"') or
+ (content[link_start] == "'" and content[end_idx-1] == "'")):
+ link_start += 1
+ end_idx -= 1
+ link = content[link_start:end_idx]
+        new_link = _rewrite_link(link, prefix)
+        links.add(link)  # record it so the referenced asset gets crawled too
+
+ new_content += content[pos:link_start]
+ new_content += new_link
+ pos = end_idx
+
+ return new_content, links
+
+
+def _rewrite_link(link, prefix):
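+    # Normalizes a URL to a site-relative path: drops any query string,
+    # strips the alternate host prefixes, unwraps /_/rsrc and /sites/p/...
+    # asset paths, and prepends `prefix` when one was given.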
+ new_link = link
+ if '?' in new_link:
+ new_link = link[0:new_link.index('?')]
+ for alt in common.alternates:
+ new_link = new_link.replace(alt, '')
+    for site_prefix in ('/sites/p/058338/', '/sites/p/d955fc'):
+ if new_link.startswith(site_prefix):
+ new_link = new_link[len(site_prefix):]
+ if new_link.startswith('/_/rsrc'):
+ new_link = '/' + '/'.join(new_link.split('/')[4:])
+ new_link = new_link.rstrip('/') or '/'
+ if prefix and new_link.startswith('/'):
+ new_link = '/%s%s' % (prefix, new_link)
+ return new_link
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))