Skip to content

Instantly share code, notes, and snippets.

@Veejay
Created January 24, 2014 21:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Veejay/8606301 to your computer and use it in GitHub Desktop.
Save Veejay/8606301 to your computer and use it in GitHub Desktop.
Link fetcher/checker
@Veejay
Copy link
Author

Veejay commented Jan 24, 2014

TODO:

  1. Add timeout to requests
  2. Still chokes on http://www.autreplanete.dev/Works/.rss (program panics)
  3. Separate the fetching from the queuing
  4. Add a list of visited links to avoid processing them more than once
  5. Find a better way to recognize external links (URI parser or something?)

@Veejay
Copy link
Author

Veejay commented Jan 28, 2014

package main

import (
  "fmt"
  "net/http"
  "code.google.com/p/go.net/html"
  "net/url"
)

// IsLink reports whether n is an element node whose tag name is "a",
// i.e. an HTML anchor.
func IsLink(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "a"
}

// fetchUrl issues an HTTP GET for url and prints the outcome to standard
// output: the error when the request fails, otherwise the URL together with
// the numeric status code of the response.
func fetchUrl(url string) {
	fmt.Printf("Fetching => %s\n", url)
	response, err := http.Get(url)
	if err != nil {
		fmt.Printf("%v\n", err)
		return
	}
	defer response.Body.Close()
	// %d, not %i: fmt has no %i verb, so the status code used to be rendered
	// as "%!i(int=200)" instead of "200".
	fmt.Printf("FETCHED URL %s, RESULT CODE IS: %d\n", url, response.StatusCode)
}

// fetchAllLinks walks the HTML tree rooted at n and starts one concurrent
// fetchUrl call for every <a> element whose first href attribute parses as a
// URL with the "http" scheme. It returns only after every fetch it started
// has finished.
//
// Waiting is essential: the fetches run in goroutines, and a Go program exits
// as soon as main returns, so without the wait the process would terminate
// before any of the fetches had a chance to complete.
func fetchAllLinks(n *html.Node) {
	done := make(chan struct{}) // one signal per finished fetch
	pending := 0                // number of fetch goroutines started

	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		if !IsLink(node) {
			for c := node.FirstChild; c != nil; c = c.NextSibling {
				walk(c)
			}
			return
		}
		// Anchor elements are not descended into; only their own href counts.
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			location, err := url.Parse(a.Val)
			if err != nil {
				fmt.Printf("%v\n", err)
			} else if location.Scheme == "http" {
				fmt.Println("GO FETCH")
				pending++
				go func(href string) {
					fetchUrl(href)
					done <- struct{}{}
				}(a.Val)
			}
			// Only the first href attribute of an anchor is considered.
			break
		}
	}
	walk(n)

	// The walk is finished, so pending is final; collect one signal per fetch.
	for ; pending > 0; pending-- {
		<-done
	}
}

// main downloads the start page, parses it as HTML and hands the resulting
// document to fetchAllLinks. Any failure is printed and ends the program.
func main() {
	response, err := http.Get("http://www.autreplanete.com")
	if err != nil {
		fmt.Printf("%v\n", err)
		// response is nil when err is non-nil; continuing would panic on
		// response.Body below.
		return
	}
	defer response.Body.Close()

	document, err := html.Parse(response.Body)
	if err != nil {
		fmt.Printf("%v\n", err)
		return
	}
	fetchAllLinks(document)
}

@Veejay
Copy link
Author

Veejay commented Jan 28, 2014

Still kind of sucks. Also, doesn't work. I'm great at this.

@Veejay
Copy link
Author

Veejay commented Feb 3, 2014

package main

import (
  "fmt"
  "net/http"
  "code.google.com/p/go.net/html"
  "net/url"
)

// HttpResponse carries the outcome of a single http.Get call from a fetching
// goroutine to whoever receives from the results channel. It is built with a
// positional literal, so the field order must not change.
type HttpResponse struct {
  // url is the address that was requested.
  url      string
  // response is the response value returned by http.Get for url.
  response *http.Response
  // err is the error value returned by http.Get for url.
  err      error
}

// IsLink reports whether n represents an anchor: an element node with the
// tag name "a".
func IsLink(n *html.Node) bool {
	isElement := n.Type == html.ElementNode
	if !isElement {
		return false
	}
	return n.Data == "a"
}

// fetchAllLinks walks the HTML tree rooted at n and, for every href attribute
// of every <a> element that parses as a URL with the "http" scheme, starts a
// goroutine that GETs the URL. Successful fetches are sent on c as an
// HttpResponse; failed fetches are printed and not sent.
//
// fetchAllLinks returns as soon as the walk is done, without waiting for the
// fetches. Once every fetch goroutine has finished, c is closed so that a
// receiver ranging over the channel terminates instead of blocking forever.
// Because it closes c, fetchAllLinks must be called at most once per channel.
func fetchAllLinks(n *html.Node, c chan<- HttpResponse) {
	done := make(chan struct{}) // one signal per finished fetch goroutine
	pending := 0                // number of fetch goroutines started

	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		if !IsLink(node) {
			for child := node.FirstChild; child != nil; child = child.NextSibling {
				walk(child)
			}
			return
		}
		// Anchor elements are not descended into; only their attributes count.
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			location, err := url.Parse(a.Val)
			if err != nil {
				fmt.Printf("%v\n", err)
				continue
			}
			if location.Scheme != "http" {
				continue
			}
			pending++
			go func(href string) {
				// Signal completion on every path, including the error one.
				defer func() { done <- struct{}{} }()
				resp, err := http.Get(href)
				if err != nil {
					fmt.Printf("RESPONSE: %v\nERROR: %v\n", resp, err)
					return
				}
				c <- HttpResponse{href, resp, err}
			}(a.Val)
		}
	}
	walk(n)

	// The walk is finished, so pending is final. Close c after the last
	// fetch goroutine has reported in; with no links this happens at once.
	go func(remaining int) {
		for ; remaining > 0; remaining-- {
			<-done
		}
		close(c)
	}(pending)
}

// main downloads the start page, parses it as HTML, starts the link fetches
// via fetchAllLinks and prints every result received on the results channel
// until that channel is closed. Any failure to download or parse the start
// page is printed and ends the program.
func main() {
	results := make(chan HttpResponse)
	response, err := http.Get("http://www.autreplanete.com")
	if err != nil {
		fmt.Printf("%v\n", err)
		// response is nil when err is non-nil; continuing would panic on
		// response.Body below.
		return
	}
	defer response.Body.Close()

	document, err := html.Parse(response.Body)
	if err != nil {
		fmt.Printf("%v\n", err)
		// Nothing will ever be sent on results, so receiving from it would
		// block forever.
		return
	}
	fetchAllLinks(document, results)

	for result := range results {
		fmt.Printf("%v", result)
		// The receiver owns the response once it has been handed over;
		// close the body so the underlying connection is released.
		if result.response != nil {
			result.response.Body.Close()
		}
	}
}

@Veejay
Copy link
Author

Veejay commented Feb 3, 2014

I guess it blocks on reading from the channel once nothing is being sent anymore, or something like that. I am not really good with this stuff.

@Veejay
Copy link
Author

Veejay commented Feb 3, 2014

Programming. It's very hard.

@Veejay
Copy link
Author

Veejay commented Feb 7, 2014

package main

import (
  "fmt"
  "net/http"
  "os"
  "net/url"
  "code.google.com/p/go.net/html"
)

// HttpResponse pairs a checked link with the status code obtained for it.
// It is built with positional literals, so the field order must not change.
type HttpResponse struct {
  // url is the link that was requested.
  url string
  // statusCode is the HTTP status code of the response, or the placeholder
  // value 999 when the request itself failed (see checkLink).
  statusCode int
}

// getHypertextReference returns the value of the first "href" attribute of
// tag, or the empty string when tag carries no such attribute.
func getHypertextReference(tag html.Token) (href string) {
	for i := range tag.Attr {
		if tag.Attr[i].Key == "href" {
			return tag.Attr[i].Val
		}
	}
	return ""
}

// extractLinksFromPage downloads the page at address, scans its HTML for <a>
// start tags and sends the href of every link whose URL scheme is "http" or
// "https" on c.
//
// c is closed when the function returns, whatever the reason: the GET failed,
// the document ended, or the tokenizer reported an error. A receiver ranging
// over c therefore always terminates. Hrefs that cannot be parsed as a URL
// are reported on standard output and skipped.
func extractLinksFromPage(address string, c chan<- string) {
	// Close exactly once, on every exit path. Closing on the </html> end tag
	// instead left the channel open for documents without one (and on GET
	// errors), and made any link found after it a send on a closed channel.
	defer close(c)

	response, err := http.Get(address)
	if err != nil {
		fmt.Printf("An error occurred while issuing a HTTP GET request to %s\n", address)
		return
	}
	defer response.Body.Close()

	tokenizer := html.NewTokenizer(response.Body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			// End of input or a tokenizer error; either way scanning is over.
			return
		}
		if tokenType != html.StartTagToken {
			continue
		}
		token := tokenizer.Token()
		if token.Data != "a" {
			continue
		}
		href := getHypertextReference(token)
		location, err := url.Parse(href)
		if err != nil {
			// One malformed link must not take the whole program down.
			fmt.Printf("Skipping malformed href %q: %v\n", href, err)
			continue
		}
		if location.Scheme == "http" || location.Scheme == "https" {
			c <- href
		}
	}
}

// checkLink issues an HTTP GET for href and reports the outcome on responses
// as exactly one HttpResponse: the status code of the response on success,
// or the placeholder code 999 when the request itself failed.
func checkLink(href string, responses chan<- HttpResponse) {
	// FIXME: 999 is not a real status code. HttpResponse should eventually
	// carry the URL, the response and any error that occurred.
	const requestFailed = 999

	resp, err := http.Get(href)
	if err != nil {
		responses <- HttpResponse{href, requestFailed}
		return
	}
	defer resp.Body.Close()

	// Every status code, 404 included, is reported the same way.
	status := resp.StatusCode
	responses <- HttpResponse{href, status}
}

// main checks every http/https link found on the page whose URL is given as
// the first command-line argument. Links are extracted in one goroutine,
// each link is checked in its own goroutine, and one status line per link is
// printed once all links have been collected.
func main() {
	// Indexing os.Args[1] without this check panics when no URL is given.
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: pass the URL of the page to check as the first argument")
		os.Exit(1)
	}

	hrefs := make(chan string)
	httpResponses := make(chan HttpResponse)
	// Here we go
	go extractLinksFromPage(os.Args[1], hrefs)

	// Start one checker per link and remember how many results to expect.
	numberOfLinks := 0
	for href := range hrefs {
		numberOfLinks++
		go checkLink(href, httpResponses)
	}

	// Each checker sends exactly one result, so receive exactly that many.
	for i := 0; i < numberOfLinks; i++ {
		response := <-httpResponses
		fmt.Printf("Status %d for URL %s\n", response.statusCode, response.url)
	}
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment