// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If the URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt
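//
// For illustration, an -infile might look like this (the URLs are
// hypothetical; blank lines and lines starting with "#" are ignored, and
// only the first whitespace-separated field on each line is used):
//
//	# One URL per line.
//	https://example.com/index.html
//	https://example.com/images/photo.jpeg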

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directories in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out in-progress manifest.tsv on every flushfreq entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("controller: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.

type key [keySize]byte

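// Cmp returns -1, 0 or +1 as k is byte-wise less than, equal to or greater
// than l.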
func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

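// Str returns k in hexadecimal, with a '/' inserted after each of the first
// levels bytes. For example, with levels == 1, "abcdef..." becomes
// "ab/cdef...".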
func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

func hash(b []byte) key {
	return sha256.Sum256(b)
}

func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

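// parseHash parses a hexadecimal hash such as that produced by key.Str,
// ignoring any '/' separators. ok is false if b does not hold exactly
// keySize bytes.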
func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

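// entry is one manifest row: a crawled URL together with its URL hash,
// content hash, HTTP status code, sniffed MIME type and content size.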
type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

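// manifest is the in-memory form of the manifest.tsv file, keyed by URL hash
// and guarded by a mutex so that multiple workers can update it.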
type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

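// put records e under k. After *flushfreqFlag updates have accumulated, it
// also writes a snapshot of the manifest to disk. The snapshot is flushed
// outside the lock so that other workers are not blocked during the write.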
func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

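// flush writes m to <outdir>/manifest.tsv, sorted by content hash and then
// URL hash. It writes to a "manifest.tsv.inprogress" file first and renames
// it afterwards, so an interrupted run cannot leave a truncated manifest.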
func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

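// parseInfile reads the -infile URL list, skipping blank lines and "#"
// comments, and groups the parsed URLs by host so that each worker handles
// (and rate-limits) one domain at a time.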
func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Strip everything up to the first whitespace.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

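// parseManifest loads any existing <outdir>/manifest.tsv so that previously
// crawled URLs are not fetched again. A missing file is not an error.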
func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

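// parseEntry parses a single tab-separated manifest.tsv line. It returns the
// zero entry (whose url field is nil) if the line is malformed.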
func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = uint64(u)
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

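// sleep pauses for between one and two times *sleepFlag, with random jitter
// so that requests to the same domain are not sent at a fixed cadence.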
func sleep(rng *rand.Rand) {
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

type worker int

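// work receives one domain's worth of URLs at a time from urlsChan, fetching
// each URL and, if it is HTML, the <img> links it contains. Results are
// recorded in the global manifest, and the worker sleeps between fetches.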
func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

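// work1 fetches a single http or https URL, unless it is already in the
// manifest. The response body is written to <outdir>/data/<contentHash> if
// that file does not already exist. If followHTML is true and the body looks
// like HTML, the <img src> links it contains are also returned.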
func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := ioutil.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

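// fetch performs an HTTP GET and returns the status code and response body.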
func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = ioutil.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

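// parseHTML tokenizes HTML data and returns the "<img src=...>" URLs within,
// resolved relative to the page URL u.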
func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

var (
	prefixBM   = []byte("BM")
	prefixGIF  = []byte("GIF8")
	prefixJPEG = []byte("\xFF\xD8")
	prefixPNG  = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF = []byte("RIFF")
	// TIFF magic: big-endian (Motorola) files start with "MM", little-endian
	// (Intel) files with "II".
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

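// sniff guesses the MIME type from the first bytes of b, recognizing a
// handful of image formats as well as HTML.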
func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

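// looksLikeHTML reports whether the first byte of b that is not whitespace
// (or another control character) is a '<'.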
func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}