// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build ignore
// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If a URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt
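//
// The input file lists one URL per line. Blank lines and lines starting with
// "#" are skipped, and anything after the first whitespace on a line is
// ignored (see parseInfile). For example (hypothetical URLs):
//
//	# seed URLs
//	https://example.com/
//	https://example.com/images/logo.png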

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directories in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out the in-progress manifest.tsv every flushfreq entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("controller: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

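// global holds the state shared by all worker goroutines.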
var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.

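// key is a SHA-256 digest. It is used both for content hashes and URL hashes.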
type key [keySize]byte

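// Cmp lexicographically compares k and l, returning -1, 0 or +1.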
func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

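// Str formats k as lowercase hexadecimal, inserting a '/' after each of the
// first levels bytes, e.g. "ab/cd/ef01..." when levels is 2.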
func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

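// hash returns the SHA-256 digest of b.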
func hash(b []byte) key {
	return sha256.Sum256(b)
}

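// unhex decodes a single lowercase hexadecimal digit, returning 0xFF if b is
// not such a digit.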
func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

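// parseHash parses a hex-encoded key, skipping any '/' separators inserted by
// key.Str. The ok result reports whether b held exactly one complete hash.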
func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

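// entry records one crawled URL: the hash of its content, the hash of the URL
// string itself, the HTTP status code, the sniffed MIME type, the content size
// in bytes and the URL.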
type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

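// manifest is a mutex-guarded map from urlHash to entry, periodically written
// out to the manifest.tsv file.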
type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

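// get returns the entry for k, or the zero entry if k is not present.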
func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

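// put inserts or updates the entry for k. If at least flushfreq puts have
// accumulated since the last flush and no flush is in progress, it snapshots
// the map (under the lock) and writes that snapshot to manifest.tsv.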
func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

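// flush writes the given manifest entries, sorted by content hash, as
// tab-separated values. It writes to manifest.tsv.inprogress and then renames
// that file to manifest.tsv, so a partial write never replaces a complete one.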
func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

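// parseInfile reads the -infile file and returns the URLs to crawl, grouped by
// host so that requests to the same domain are handled (and rate-limited) by a
// single worker.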
func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Strip everything from the first whitespace onwards.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

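// parseManifest loads any existing manifest.tsv from -outdir, so that URLs
// crawled on a previous run are skipped. A missing file is not an error.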
func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

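// parseEntry parses one tab-separated manifest.tsv line. It returns the zero
// entry (whose url field is nil) if any field is malformed.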
func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = u
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

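// sleep pauses for a random duration between -sleep and twice that, spacing
// out requests to the same domain.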
func sleep(rng *rand.Rand) {
	// Guard against a non-positive -sleep value, which would make Int63n panic.
	if *sleepFlag <= 0 {
		return
	}
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

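// worker is a numeric ID for one crawler goroutine.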
type worker int

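// work consumes batches of same-domain URLs from urlsChan. For each URL it
// fetches the content, fetches any image links found in HTML content (one
// level deep, not recursively) and records the results in the global manifest.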
func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

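// work1 fetches a single URL, skipping it if it is already in the manifest. It
// writes the response body under -outdir/data, keyed by content hash, if that
// file does not already exist. If followHTML is true and the body looks like
// HTML, links holds the <img src> URLs it contains. The fetched result reports
// whether an HTTP request was actually made.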
func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := os.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

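// fetch performs an HTTP GET of urlString, returning the status code and the
// full response body.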
func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = io.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

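// parseHTML tokenizes an HTML document and returns the URLs of its <img src>
// attributes, resolved relative to u.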
func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

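// Magic-number prefixes used by sniff. prefixZZZZ matches the four reserved
// zero bytes at offset 6 of a BMP file header; prefixWEBP is matched at offset
// 8, inside a RIFF container.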
var (
	prefixBM     = []byte("BM")
	prefixGIF    = []byte("GIF8")
	prefixJPEG   = []byte("\xFF\xD8")
	prefixPNG    = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF   = []byte("RIFF")
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

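// sniff returns a guess at the MIME type of b, based on its leading bytes,
// falling back to "text/html" or "unknown".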
func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

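// looksLikeHTML reports whether the first byte of b that is not whitespace (or
// another control character) is '<'.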
func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}