// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build ignore
// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If the URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt
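//
// For example (illustrative): with the default -datadirlevels=1, a response
// whose SHA-256 content hash begins 0x12, 0x34, … is saved as
// outdir/data/12/3456…, and its manifest.tsv row holds that content hash, the
// URL hash, the HTTP status code, the sniffed MIME type, the size in bytes
// and the original URL, separated by tabs.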

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directories in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out in-progress manifest.tsv on every flushfreq entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("controller: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.

type key [keySize]byte

func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

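// Str returns k's bytes as lowercase hexadecimal, inserting a '/' after each
// of the first levels bytes. For example, a key whose bytes start 0x12, 0x34,
// 0x56 yields "12/34/56…" when levels is 2.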
func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

func hash(b []byte) key {
	return sha256.Sum256(b)
}

func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

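// parseHash is the inverse of key.Str: it decodes hexadecimal bytes, skipping
// any '/' separators. ok is true only if b encoded exactly keySize bytes.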
func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

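// manifest is an in-memory copy of the manifest.tsv file, keyed by urlHash.
// It is safe for concurrent use by multiple worker goroutines: put flushes a
// snapshot to disk every flushfreq entries, with at most one flush in flight.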
type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

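// flush writes the manifest entries to outdir/manifest.tsv, sorted by content
// hash and then by URL hash. It writes to a ".inprogress" file first and then
// renames it, so a crash mid-write cannot corrupt an existing manifest.tsv.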
func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

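// parseInfile reads the -infile URL list and groups the URLs by their host,
// so that each worker can rate-limit its requests per domain. Blank lines and
// "#" comments are skipped, as is anything after the first whitespace on a
// line, and unparseable URLs are ignored. For example (illustrative):
//
//	# seed URLs
//	https://example.com/index.html
//	https://example.org/a.png    trailing text is ignored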
func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Strip everything from the first whitespace onwards.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

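// parseManifest loads any manifest.tsv left behind by a previous run, which
// lets the crawler resume: URLs whose hash is already present are not
// re-fetched. A missing manifest.tsv is not an error.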
func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

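// parseEntry parses one tab-separated manifest.tsv line, in the same field
// order that flush writes: content hash, URL hash, HTTP status code, sniffed
// MIME type, size and URL. A zero entry (with a nil url) means a parse error.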
func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = u
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

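// sleep pauses for a uniformly random duration in [d, 2*d), where d is the
// -sleep flag value, jittering the requests to any one domain. It is a no-op
// when d is zero.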
func sleep(rng *rand.Rand) {
	if *sleepFlag <= 0 {
		// rng.Int63n panics if its argument is non-positive, so skip the
		// jittered sleep entirely when -sleep is zero.
		return
	}
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

type worker int

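// work consumes per-domain URL batches from urlsChan until it is closed,
// fetching each URL (and, one level deep, any <img> links found in fetched
// HTML) and recording the results in the global manifest.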
func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := os.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = io.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

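// parseHTML tokenizes data and returns the URLs of its <img src="..."> tags,
// resolved against the page's own URL u. For example (illustrative), a src of
// "/a.png" on the page "https://example.com/dir/index.html" resolves to
// "https://example.com/a.png".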
func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

var (
	prefixBM     = []byte("BM")
	prefixGIF    = []byte("GIF8")
	prefixJPEG   = []byte("\xFF\xD8")
	prefixPNG    = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF   = []byte("RIFF")
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

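// sniff guesses b's MIME type from its leading magic bytes (BMP, GIF, JPEG,
// PNG, WebP and TIFF), falling back to "text/html" if the payload looks like
// HTML and "unknown" otherwise.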
func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}