#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script for crawling a website.

This script will crawl a website and download all of the pages and
related assets, rewriting the links to point to the local copies.
"""

import argparse
import html
import os
import sys
import time
import urllib.parse
import urllib.request

import common


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--jobs', type=int,
                        default=common.cpu_count(),
                        help='Number of URLs to fetch in parallel '
                             '(default %(default)s)')
    parser.add_argument('-l', '--limit', type=int, metavar='N',
                        help='Only fetch the first N pages')
    parser.add_argument('-o', '--out-dir', default='pages')
    parser.add_argument('--path-list', default='scripts/paths_to_crawl.txt',
                        help='initial list of URLs to seed the crawl')
    parser.add_argument('--paths-to-skip', default='scripts/paths_to_skip.txt',
                        help='list of URLs to skip (expected 404s)')
    parser.add_argument('--prefix', default='',
                        help='path to prepend to all the URLs')
    parser.add_argument('paths', nargs='*')
    args = parser.parse_args(argv)

    if args.paths:
        urls = [common.site + path for path in args.paths]
    elif args.path_list:
        urls = [common.site + path for path in
                common.read_paths(args.path_list)]
    else:
        urls = [common.site + '/']

    args.alternates = common.alternates

    Crawler(args).crawl(urls)


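# Crawler drives a common.JobQueue (defined in common.py, which is not shown
# here, so its exact semantics are assumed from how it is used): _filter()
# decides whether a URL becomes a fetch task, _handle_url() runs for each
# task and returns any newly discovered links, and crawl() keeps feeding
# those links back into the queue until it drains.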
class Crawler:
    def __init__(self, args):
        self.site = None
        self.host = None
        self.args = args
        self.queue = common.JobQueue(self._handle_url, self.args.jobs,
                                     multiprocess=False)
        self.paths_to_skip = set()

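    # crawl() seeds the queue with the initial URLs, then loops over completed
    # tasks, counting errors and enqueueing every link discovered on each
    # fetched page (resolved relative to that page's own path).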
    def crawl(self, urls):
        if self.args.paths_to_skip:
            self.paths_to_skip = common.read_paths(self.args.paths_to_skip)

        self.site = urllib.parse.urlparse(urls[0])
        self.host = urllib.parse.urlunparse(
            urllib.parse.ParseResult(self.site.scheme,
                                     self.site.netloc,
                                     path='', params='', query='', fragment=''))

        self._fetch(urls)
        num_errors = 0
        num_urls = 0
        for task, res, links in self.queue.results():
            if res:
                num_errors += 1
            num_urls += 1
            self._fetch(urllib.parse.urljoin(self.host + task, link)
                        for link in links)

        print('Crawled %d URLs%s.' % (num_urls,
              (' (%d errors)' % num_errors) if num_errors else ''))

        return 0

    def _fetch(self, urls):
        for url in urls:
            should_fetch, task, new_url = self._filter(url)
            if should_fetch:
                self.queue.request(task, new_url)

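    # _filter() returns a (should_fetch, task, new_url) triple: `task` is the
    # normalized local path used to deduplicate work and `new_url` is the URL
    # that will actually be fetched (query, fragment, and trailing slash
    # stripped).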
    def _filter(self, url):
        comps = urllib.parse.urlparse(url)
        if (not any(url.startswith(x) for x in common.alternates) or
                comps.path.startswith('/system/app/pages')):
            return False, comps.path, url

        task = _rewrite_link(url, '')
        idx = task.find('#')
        if idx != -1:
            task = task[:idx]
        task = task.rstrip('/') or '/'
        if task in self.paths_to_skip:
            return False, task, url
        if task in self.queue.all_tasks():
            return False, task, url

        new_url_comps = urllib.parse.ParseResult(
            comps.scheme,
            comps.netloc,
            comps.path.rstrip('/') or '/',
            params='',
            query='',
            fragment='')
        new_url = urllib.parse.urlunparse(new_url_comps)

        all_tasks = self.queue.all_tasks()
        if not self.args.limit or len(all_tasks) < self.args.limit:
            if task not in all_tasks:
                return True, task, new_url
        return False, task, new_url

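    # _handle_url() runs on a queue worker: it fetches the URL (retrying up
    # to three times on errors), rewrites HTML and CSS responses so their
    # links point at the local copies, writes the result under --out-dir,
    # and returns (error_string, discovered_links).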
    def _handle_url(self, task, url):
        del task
        prefix = self.args.prefix
        out_dir = self.args.out_dir
        comps = urllib.parse.urlparse(url)
        path = _rewrite_link(url, prefix)

        res = ''
        links = []

        for i in range(4):
            try:
                with urllib.request.urlopen(url) as resp:
                    content_type = resp.getheader('Content-Type')
                    content = resp.read()
            except Exception as e:
                if i < 3:
                    time.sleep(1.0)
                    continue

                res = '%s: %s' % (type(e), str(e))
                return res, links

            if content_type.startswith('text/html'):
                page, links = _rewrite_html(content.decode('utf-8'), prefix)
                new_content = page.encode('utf-8')
                path = path.rstrip('/') + '/index.html'
            elif content_type == 'text/css':
                page, links = _rewrite_css(content.decode('utf-8'), prefix)
                new_content = page.encode('utf-8')
            else:
                new_content = content
            common.write_if_changed(out_dir + path, new_content)
            return res, links


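# The rewrite helpers below work by plain string scanning rather than a real
# HTML/CSS parser, so they only handle the simple '<tag attr="...">',
# '@import "..."', and 'url(...)' patterns that appear on the crawled pages.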
def _rewrite_html(page, prefix):
    links = set()
    page = _rewrite_tag(page, prefix, links, 'a', 'href')
    page = _rewrite_tag(page, prefix, links, 'img', 'src')
    page = _rewrite_tag(page, prefix, links, 'script', 'src')
    for val in ('stylesheet', 'shortcut icon', 'apple-touch-icon'):
        page = _rewrite_tag(page, prefix, links, 'link', 'href', val)

    return page, links


def _rewrite_tag(page, prefix, links, tag, attr, rel=None, follow=True):
    new_page = ''
    if rel:
        tag_str = '<%s rel="%s"' % (tag, rel)
    else:
        tag_str = '<%s' % (tag,)
    attr_str = '%s="' % (attr,)

    pos = 0
    while True:
        idx = page.find(tag_str, pos)
        if idx == -1:
            new_page += page[pos:]
            break

        tag_close_idx = page.find('>', idx)
        attr_idx = page.find(attr_str, idx)
        if attr_idx == -1 or attr_idx > tag_close_idx:
            new_page += page[pos:tag_close_idx]
            pos = tag_close_idx
            continue

        link_start = attr_idx + len(attr_str)
        link_end = page.find('"', link_start)

        link = html.unescape(page[link_start:link_end])
        new_link = _rewrite_link(link, prefix)

        if follow or tag != 'a':
            links.add(link)

        new_page += page[pos:link_start]
        new_page += new_link
        pos = link_end

    return new_page


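# CSS is rewritten by scanning for '@import "..."' and 'url(...)' rules,
# rewriting the URL inside each one and collecting the referenced URLs so
# the crawler can fetch stylesheet assets as well.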
def _rewrite_css(content, prefix):
    content, links = _rewrite_rule(content, prefix, '@import "', '"')
    content, more_links = _rewrite_rule(content, prefix, 'url(', ')')
    links.update(more_links)

    return content, links


def _rewrite_rule(content, prefix, start, end):
    new_content = ''
    links = set()

    pos = 0
    while True:
        start_idx = content.find(start, pos)
        if start_idx == -1:
            new_content += content[pos:]
            break

        # Search for the closing delimiter only after the opening token;
        # searching from start_idx would match the quote inside '@import "'.
        end_idx = content.find(end, start_idx + len(start))

        link_start = start_idx + len(start)

        if ((content[link_start] == '"' and content[end_idx-1] == '"') or
                (content[link_start] == "'" and content[end_idx-1] == "'")):
            link_start += 1
            end_idx -= 1
        link = content[link_start:end_idx]
        new_link = _rewrite_link(link, prefix)
        # Collect the referenced URL so assets mentioned in CSS get crawled.
        links.add(link)

        new_content += content[pos:link_start]
        new_content += new_link
        pos = end_idx

    return new_content, links


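# _rewrite_link() maps a site URL to the corresponding local path: it drops
# any query string, strips the common.alternates hostnames, removes a couple
# of hosting-specific path prefixes ('/sites/p/...' and '/_/rsrc/...'), and
# prepends --prefix if one was given.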
def _rewrite_link(link, prefix):
    new_link = link
    if '?' in new_link:
        new_link = link[0:new_link.index('?')]
    for alt in common.alternates:
        new_link = new_link.replace(alt, '')
    for site_prefix in ('/sites/p/058338/', '/sites/p/d955fc'):
        if new_link.startswith(site_prefix):
            new_link = new_link[len(site_prefix):]
    if new_link.startswith('/_/rsrc'):
        new_link = '/' + '/'.join(new_link.split('/')[4:])
    new_link = new_link.rstrip('/') or '/'
    if prefix and new_link.startswith('/'):
        new_link = '/%s%s' % (prefix, new_link)
    return new_link


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))