Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:03
Show Gist options
  • Save vgarvardt/84c20bb51f7d9ad0f8d2 to your computer and use it in GitHub Desktop.
Save vgarvardt/84c20bb51f7d9ad0f8d2 to your computer and use it in GitHub Desktop.
Simple crawler in Go
// originally found @
package main
// Go has few builtins, so a fair number of standard-library imports are needed.
import (
	"bytes"
	"crypto/md5"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"sync"
)
// usage_exit reports why the command line was rejected, prints the expected
// synopsis to stderr and terminates the process with status 2.
func usage_exit(reason string) {
	fmt.Fprintf(os.Stderr, "%s\nusage: %s <source-url> <workers> <max-pages>\n", reason, os.Args[0])
	os.Exit(2)
}

// string_arg returns the command-line argument at position pos, or exits with
// a usage message when the argument was not supplied (instead of panicking
// with an index-out-of-range error).
func string_arg(pos int, name string) string {
	if pos >= len(os.Args) {
		usage_exit("missing argument: " + name)
	}
	return os.Args[pos]
}

// count_arg returns the command-line argument at position pos parsed as a
// positive integer, or exits with a usage message. Zero and negative values
// are rejected because they would produce unusable (zero-capacity) channels
// or make the channel allocation panic.
func count_arg(pos int, name string) int {
	raw := string_arg(pos, name)
	n, err := strconv.Atoi(raw)
	if err != nil || n < 1 {
		usage_exit(fmt.Sprintf("%s must be a positive integer, got %q", name, raw))
	}
	return n
}

// source is the link the crawl starts from (first argument).
var source = string_arg(1, "source-url")

// num_worker_threads is the number of crawler goroutines to start (second argument).
var num_worker_threads = count_arg(2, "workers")

// num_to_crawl is the maximum number of pages to fetch (third argument).
var num_to_crawl = count_arg(3, "max-pages")

// crawled is a buffered channel used as a counter of page fetches: each fetch
// claims one slot, and a full buffer means the page budget is spent.
var crawled = make(chan int, num_to_crawl)

// links is a buffered channel used as the queue of links waiting to be fetched.
var links = make(chan string, num_to_crawl)
func do_work(link string, crawler_id int) {
//fmt.Println("crawling", crawler_id, link)
re := regexp.MustCompile(`<a href="(http.*?)"`)
resp, err := http.Get(link)
if err != nil {
defer resp.Body.Close()
content, _ := ioutil.ReadAll(resp.Body)
contentString := bytes.NewBuffer(content).String()
h := md5.New()
io.WriteString(h, contentString)
var _ = h.Sum(nil)
//Try to add a link to the queue of links. If it is full, the default case
//returns as there is no point in adding more links to the queue as our
//maximum page fetches is limited anyways.
for _, match := range re.FindAllStringSubmatch(contentString, -1) {
select {
case links <- match[1]:
func worker(crawler_id int) {
//If the crawled channel's buffer is full, no more pages to fetch
//so no more work to do.
for {
select {
case crawled <- 1:
do_work(<-links, crawler_id)
func main() {
var _ = fmt.Println
//Try to make the workers use all the logical CPUs in the machine.
var wg sync.WaitGroup
links <- source
for i:=0; i < num_worker_threads; i++ {
// Increment the WaitGroup counter.
// Launch a goroutine worker.
go func(crawler_id int) {
// Decrement the counter when the goroutine completes.
defer wg.Done()
// Wait for all the workers to finish.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment