@vgarvardt
Last active August 29, 2015 14:03
Simple crawler in Go
// originally found @ http://venkat.io/posts/concurrent-crawling/
package main
// The builtins are limited, making a lot of imports necessary.
import (
    "bytes"
    "crypto/md5"
    "fmt"
    "io"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
    "runtime"
    "strconv"
    "sync"
)
var source = os.Args[1]                              // source link
var num_worker_threads, _ = strconv.Atoi(os.Args[2]) // specifying how many workers
var num_to_crawl, _ = strconv.Atoi(os.Args[3])       // maximum number of pages to fetch

var crawled = make(chan int, num_to_crawl) // buffered channel to count page fetches
var links = make(chan string, num_to_crawl) // buffered channel as a queue of links
func do_work(link string, crawler_id int) {
    //fmt.Println("crawling", crawler_id, link)
    re := regexp.MustCompile(`<a href="(http.*?)"`)
    resp, err := http.Get(link)
    if err != nil {
        return
    }
    defer resp.Body.Close()
    content, _ := ioutil.ReadAll(resp.Body)
    contentString := bytes.NewBuffer(content).String()
    // Compute an MD5 digest of the page body; the result is discarded.
    h := md5.New()
    io.WriteString(h, contentString)
    var _ = h.Sum(nil)
    // Try to add each extracted link to the queue of links. If the queue is
    // full, the default case returns: there is no point in queueing more
    // links, since the maximum number of page fetches is limited anyway.
    for _, match := range re.FindAllStringSubmatch(contentString, -1) {
        select {
        case links <- match[1]:
        default:
            return
        }
    }
}
func worker(crawler_id int) {
    // If the crawled channel's buffer is full, no more pages need to be
    // fetched, so there is no more work to do.
    for {
        select {
        case crawled <- 1:
            do_work(<-links, crawler_id)
        default:
            return
        }
    }
}
func main() {
    var _ = fmt.Println
    // Try to make the workers use all the logical CPUs in the machine.
    runtime.GOMAXPROCS(runtime.NumCPU())
    var wg sync.WaitGroup
    links <- source
    for i := 0; i < num_worker_threads; i++ {
        // Increment the WaitGroup counter.
        wg.Add(1)
        // Launch a goroutine worker.
        go func(crawler_id int) {
            // Decrement the counter when the goroutine completes.
            defer wg.Done()
            worker(crawler_id)
        }(i)
    }
    // Wait for all the workers to finish.
    wg.Wait()
    close(crawled)
    close(links)
}
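The program expects three positional arguments: the seed URL, the number of worker goroutines, and the maximum number of pages to fetch. Assuming the source is saved as crawler.go (the filename is arbitrary), one possible invocation is:

go run crawler.go http://venkat.io/ 4 100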