// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build ignore
// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If a URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt
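//
// The input file lists one URL per line. Blank lines and lines starting with
// "#" are skipped, and anything after the first whitespace on a line is
// ignored (see parseInfile). For example (hypothetical URLs):
//
//	# seed URLs
//	https://example.com/
//	https://example.com/images/logo.png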

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directories in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out the in-progress manifest.tsv every flushfreq entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("controller: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

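// global holds the state shared by all worker goroutines.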
var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.

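// key is a SHA-256 digest. It is used both for content hashes and URL hashes.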
type key [keySize]byte

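// Cmp lexicographically compares k and l, returning -1, 0 or +1.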
func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

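// Str formats k as lowercase hexadecimal, inserting a '/' after each of the
// first levels bytes, e.g. "ab/cd/ef01..." when levels is 2.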
func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

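// hash returns the SHA-256 digest of b.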
func hash(b []byte) key {
	return sha256.Sum256(b)
}

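// unhex decodes a single lowercase hexadecimal digit, returning 0xFF if b is
// not such a digit.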
func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

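// parseHash parses a hex-encoded key, skipping any '/' separators inserted by
// key.Str. The ok result reports whether b held exactly one complete hash.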
func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

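// entry records one crawled URL: the hash of its content, the hash of the URL
// string itself, the HTTP status code, the sniffed MIME type, the content size
// in bytes and the URL.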
type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

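// manifest is a mutex-guarded map from urlHash to entry, periodically written
// out to the manifest.tsv file.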
type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

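// get returns the entry for k, or the zero entry if k is not present.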
func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

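// put inserts or updates the entry for k. If at least flushfreq puts have
// accumulated since the last flush and no flush is in progress, it snapshots
// the map (under the lock) and writes that snapshot to manifest.tsv.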
func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

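// flush writes the given manifest entries, sorted by content hash, as
// tab-separated values. It writes to manifest.tsv.inprogress and then renames
// that file to manifest.tsv, so a partial write never replaces a complete one.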
func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

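// parseInfile reads the -infile file and returns the URLs to crawl, grouped by
// host so that requests to the same domain are handled (and rate-limited) by a
// single worker.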
func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Strip everything from the first whitespace onwards.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

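// parseManifest loads any existing manifest.tsv from -outdir, so that URLs
// crawled on a previous run are skipped. A missing file is not an error.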
func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

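// parseEntry parses one tab-separated manifest.tsv line. It returns the zero
// entry (whose url field is nil) if any field is malformed.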
func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = u
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

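// sleep pauses for a random duration between -sleep and twice that, spacing
// out requests to the same domain.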
func sleep(rng *rand.Rand) {
	// Guard against a non-positive -sleep value, which would make Int63n panic.
	if *sleepFlag <= 0 {
		return
	}
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

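// worker is a numeric ID for one crawler goroutine.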
type worker int

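// work consumes batches of same-domain URLs from urlsChan. For each URL it
// fetches the content, fetches any image links found in HTML content (one
// level deep, not recursively) and records the results in the global manifest.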
func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

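// work1 fetches a single URL, skipping it if it is already in the manifest. It
// writes the response body under -outdir/data, keyed by content hash, if that
// file does not already exist. If followHTML is true and the body looks like
// HTML, links holds the <img src> URLs it contains. The fetched result reports
// whether an HTTP request was actually made.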
func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := os.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

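// fetch performs an HTTP GET of urlString, returning the status code and the
// full response body.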
func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = io.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

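// parseHTML tokenizes an HTML document and returns the URLs of its <img src>
// attributes, resolved relative to u.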
func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

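// Magic-number prefixes used by sniff. prefixZZZZ matches the four reserved
// zero bytes at offset 6 of a BMP file header; prefixWEBP is matched at offset
// 8, inside a RIFF container.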
var (
	prefixBM     = []byte("BM")
	prefixGIF    = []byte("GIF8")
	prefixJPEG   = []byte("\xFF\xD8")
	prefixPNG    = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF   = []byte("RIFF")
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

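// sniff returns a guess at the MIME type of b, based on its leading bytes,
// falling back to "text/html" or "unknown".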
func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

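// looksLikeHTML reports whether the first byte of b that is not whitespace (or
// another control character) is '<'.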
func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}