Created
October 4, 2019 21:42
-
-
Save jrwren/fe5f95236eede049b0993056096d9144 to your computer and use it in GitHub Desktop.
crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"io/ioutil" | |
"net/http" | |
"net/url" | |
"os" | |
"regexp" | |
"strconv" | |
"sync" | |
) | |
// Package-level crawler state shared by all crawl goroutines.
var (
	// dpLock guards every access to domainPool.
	dpLock sync.Mutex
	// domainPool maps a host name to the buffered channel used to limit
	// concurrent fetches against that host. It is initialised here so that
	// a lookup can never write to a nil map, whatever the start-up order.
	domainPool = make(map[string]chan string)
	// urlre matches the links to follow in a fetched page; it is compiled
	// during start-up.
	urlre *regexp.Regexp
	// debug enables verbose output on standard output.
	debug bool
	// wg counts in-flight crawl goroutines so main can wait for them.
	wg sync.WaitGroup
)
// Crawl tuning constants.
const (
	// urlreS is the source of the link-matching regular expression: a
	// double-quoted href attribute holding an absolute http or https URL
	// that contains neither a double quote nor a space.
	urlreS = `href="https?://[^" ]*"`

	// connectionsPerHostname is the buffer size of each per-host channel,
	// i.e. the number of fetches that may run against one host at a time.
	connectionsPerHostname = 6
)
func main() { | |
flag.BoolVar(&debug, "debug", false, "debug output") | |
flag.Parse() | |
urlre = regexp.MustCompile(urlreS) | |
domainPool = make(map[string]chan string) | |
args := flag.Args() | |
if len(args) == 0 { | |
args = []string{"https://google.com", | |
"https://facebook.com", "https://wikipedia.com", | |
"https://yahoo.com"} | |
} | |
for _, url := range args { | |
wg.Add(1) | |
go errlogCrawl(url) | |
} | |
wg.Wait() | |
} | |
func errlogCrawl(u string) { | |
func() { | |
err := crawlURL(u) | |
if err != nil { | |
fmt.Printf("err: %v\n", err) | |
} | |
}() | |
} | |
func crawlURL(u string) error { | |
defer wg.Done() | |
if debug { | |
fmt.Printf("crawlURL %s\n", u) | |
} | |
up, err := url.Parse(u) | |
if err != nil { | |
return err | |
} | |
limit := getDomainPool(up.Host) | |
limit <- u | |
resp, err := http.Get(u) | |
if err != nil { | |
return err | |
} | |
if resp.Body == nil { | |
return nil | |
} | |
b, err := ioutil.ReadAll(resp.Body) | |
if err != nil { | |
return err | |
} | |
resp.Body.Close() | |
r := <-limit | |
if debug { | |
fmt.Printf("read %v from domain pool", r) | |
} | |
body := string(b) | |
if debug { | |
os.Stdout.WriteString(u) | |
os.Stdout.WriteString("\t") | |
os.Stdout.WriteString(strconv.Itoa(len(body))) | |
os.Stdout.WriteString("\n") | |
os.Stdout.WriteString(body) | |
os.Stdout.WriteString("\n") | |
} | |
urls := urlre.FindAllString(body, 100) // First 100 urls. | |
if debug { | |
fmt.Printf("found urls: %v", urls) | |
} | |
for _, next := range urls { | |
next = next[6 : len(next)-1] | |
wg.Add(1) | |
go errlogCrawl(next) | |
} | |
return nil | |
} | |
func getDomainPool(hn string) chan string { | |
dpLock.Lock() | |
defer dpLock.Unlock() | |
p, ok := domainPool[hn] | |
if !ok { | |
p = make(chan string, connectionsPerHostname) | |
domainPool[hn] = p | |
} | |
return p | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment