Skip to content

Instantly share code, notes, and snippets.

@jrwren
Created October 4, 2019 21:42
Show Gist options
  • Save jrwren/fe5f95236eede049b0993056096d9144 to your computer and use it in GitHub Desktop.
Save jrwren/fe5f95236eede049b0993056096d9144 to your computer and use it in GitHub Desktop.
crawler
package main
import (
"flag"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"os"
"regexp"
"strconv"
"sync"
)
// Package-level state shared by every crawl goroutine.
var (
// dpLock guards domainPool; getDomainPool holds it for every map access.
dpLock sync.Mutex
// domainPool maps a URL host to the buffered channel used to limit
// concurrent fetches for that host. Entries are created lazily by
// getDomainPool.
domainPool map[string]chan string
// urlre is the compiled form of urlreS; main compiles it before any
// crawl starts.
urlre *regexp.Regexp
// debug is set by the -debug flag and enables verbose output on stdout.
debug bool
// wg counts crawls that have been scheduled but not yet finished; main
// waits on it before exiting.
wg sync.WaitGroup
)
const (
// connectionsPerHostname is the buffer capacity of each per-host channel
// in domainPool, and therefore the number of fetches that may be in
// flight for one host before further senders block.
connectionsPerHostname = 6
// urlreS matches a double-quoted href attribute holding an absolute
// http or https URL that contains no spaces or double quotes. The match
// includes the leading `href="` and the closing quote.
urlreS = `href="https?://[^" ]*"`
)
// main parses the command line, initialises the shared regexp and per-host
// limiter map, and starts one crawl goroutine per seed URL. Seed URLs are the
// positional arguments; when none are given a built-in list is used. It
// returns once every scheduled crawl has called wg.Done.
func main() {
	flag.BoolVar(&debug, "debug", false, "debug output")
	flag.Parse()

	// Shared state must be ready before the first goroutine is launched.
	urlre = regexp.MustCompile(urlreS)
	domainPool = make(map[string]chan string)

	seeds := flag.Args()
	if len(seeds) == 0 {
		seeds = []string{
			"https://google.com",
			"https://facebook.com",
			"https://wikipedia.com",
			"https://yahoo.com",
		}
	}

	for i := range seeds {
		// Add before starting the goroutine so Wait cannot observe a zero
		// counter while work is still being scheduled.
		wg.Add(1)
		go errlogCrawl(seeds[i])
	}
	wg.Wait()
}
// errlogCrawl crawls u and, if the crawl reports an error, prints that error
// to stdout. It is the entry point used for every crawl goroutine.
func errlogCrawl(u string) {
	if err := crawlURL(u); err != nil {
		fmt.Printf("err: %v\n", err)
	}
}
// crawlURL fetches u, extracts up to 100 absolute http(s) links from the
// response body and schedules a new crawl goroutine for each of them.
//
// The caller must have called wg.Add(1) for this crawl; crawlURL calls
// wg.Done when it returns. The fetch is throttled by the per-host limiter
// obtained from getDomainPool.
//
// It returns an error if u cannot be parsed, the GET request fails, or the
// body cannot be read. Links are not deduplicated here, so the same URL may
// be crawled more than once.
func crawlURL(u string) error {
	defer wg.Done()
	if debug {
		fmt.Printf("crawlURL %s\n", u)
	}
	up, err := url.Parse(u)
	if err != nil {
		return err
	}

	b, err := fetchBody(u, getDomainPool(up.Host))
	if err != nil {
		return err
	}

	body := string(b)
	if debug {
		os.Stdout.WriteString(u)
		os.Stdout.WriteString("\t")
		os.Stdout.WriteString(strconv.Itoa(len(body)))
		os.Stdout.WriteString("\n")
		os.Stdout.WriteString(body)
		os.Stdout.WriteString("\n")
	}

	urls := urlre.FindAllString(body, 100) // First 100 urls.
	if debug {
		fmt.Printf("found urls: %v", urls)
	}
	for _, next := range urls {
		// Strip the leading `href="` (6 bytes) and the closing quote that
		// urlre includes in every match.
		next = next[6 : len(next)-1]
		wg.Add(1)
		go errlogCrawl(next)
	}
	return nil
}

// fetchBody performs a GET of u while holding one slot of limit and returns
// the complete response body.
//
// The slot is acquired by sending on limit, which blocks while the channel's
// buffer is full, and is released by a deferred receive. Releasing in a defer
// matters: if a failed request returned without releasing, each failure would
// permanently consume a slot and later crawls of the same host would block
// forever once the buffer filled.
func fetchBody(u string, limit chan string) ([]byte, error) {
	limit <- u
	defer func() {
		r := <-limit
		if debug {
			fmt.Printf("read %v from domain pool", r)
		}
	}()

	resp, err := http.Get(u)
	if err != nil {
		return nil, err
	}
	// net/http guarantees a non-nil Body on a client response when err is
	// nil, so it is always safe (and required) to close it.
	defer resp.Body.Close()

	b, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return b, nil
}
// getDomainPool returns the limiter channel for host name hn, creating and
// registering a channel with capacity connectionsPerHostname the first time
// hn is seen. The lookup and insertion happen under dpLock, so concurrent
// callers asking for the same host receive the same channel.
func getDomainPool(hn string) chan string {
	dpLock.Lock()
	defer dpLock.Unlock()

	if existing, found := domainPool[hn]; found {
		return existing
	}
	fresh := make(chan string, connectionsPerHostname)
	domainPool[hn] = fresh
	return fresh
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment