Created
January 24, 2014 21:00
-
-
Save Veejay/8606301 to your computer and use it in GitHub Desktop.
Link fetcher/checker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import (
	"fmt"
	/* "io/ioutil" */
	"net/http"
	"code.google.com/p/go.net/html"
	"strings"
)
func main() { | |
response, err := http.Get("http://www.autreplanete.com") | |
if err != nil { | |
// handle error | |
} | |
document, err := html.Parse(response.Body) | |
var f func(*html.Node) | |
f = func(n *html.Node) { | |
if n.Type == html.ElementNode && n.Data == "a" { | |
for _, a := range n.Attr { | |
if a.Key == "href" { | |
if strings.Contains(a.Val, "http://"){ | |
fmt.Printf("Fetching => %s", a.Val) | |
response, err := http.Get(a.Val) | |
if err != nil { | |
fmt.Printf("%v", response.StatusCode) | |
panic(err) | |
} | |
defer response.Body.Close() | |
if (response.StatusCode != http.StatusNotFound){ | |
fmt.Printf("%s %v\n\n\n\n", a.Val, response.StatusCode) | |
} | |
} | |
break | |
} | |
} | |
} | |
for c := n.FirstChild; c != nil; c = c.NextSibling { | |
f(c) | |
} | |
} | |
f(document) | |
} |
Author
Veejay
commented
Jan 28, 2014
Still kind of sucks. Also, doesn't work. I'm great at this.
package main
import (
"fmt"
"net/http"
"code.google.com/p/go.net/html"
"net/url"
)
// HttpResponse pairs a fetched URL with the *http.Response it produced and
// any error returned by the GET request.
type HttpResponse struct {
url string // the URL that was requested
response *http.Response // nil when err is non-nil
err error // transport-level error from http.Get, if any
}
// IsLink reports whether n is an <a> element node.
func IsLink(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "a"
}
// fetchAllLinks walks the HTML tree rooted at n and, for every <a> element
// whose href parses as an absolute http URL, starts a goroutine that GETs
// the URL and delivers the outcome on c.
//
// BUG FIX: the original printed-and-dropped results on the error path
// instead of sending them, so a consumer expecting one HttpResponse per
// link could wait forever. Failures are now delivered through the channel
// too, using the err field HttpResponse already carries.
func fetchAllLinks(n *html.Node, c chan<- HttpResponse) {
	if IsLink(n) {
		for _, a := range n.Attr {
			if a.Key == "href" {
				location, err := url.Parse(a.Val)
				if err != nil {
					fmt.Printf("%v\n", err)
				} else if location.Scheme == "http" {
					go func(u string, ch chan<- HttpResponse) {
						resp, err := http.Get(u)
						// Send successes and failures alike: exactly one
						// HttpResponse per dispatched link.
						ch <- HttpResponse{u, resp, err}
					}(a.Val, c)
				}
			}
		}
	} else {
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			fetchAllLinks(child, c)
		}
	}
}
// main fetches a hard-coded page, parses it, dispatches concurrent checks
// for every absolute link found, and prints each result as it arrives.
func main() {
results := make(chan HttpResponse)
response, err := http.Get("http://www.autreplanete.com")
if err != nil {
// NOTE(review): execution continues after this print, so the
// html.Parse below dereferences a nil response — should return here.
fmt.Printf("%v\n", err)
}
document, err := html.Parse(response.Body)
if err != nil {
fmt.Printf("%v\n", err)
} else {
fetchAllLinks(document, results)
}
// NOTE(review): deadlock — nothing ever closes `results`, so this range
// blocks forever once the fetch goroutines finish. The producer must
// close the channel (or the loop must count expected responses).
for result := range results {
fmt.Printf("%v", result)
}
}
I guess it blocks on the reading of the channel and nothing is being sent anymore or something. I am not really good with this stuff.
Programming. It's very hard.
package main
import (
"fmt"
"net/http"
"os"
"net/url"
"code.google.com/p/go.net/html"
)
// HttpResponse records the status code observed for a checked URL.
type HttpResponse struct {
url string // the URL that was checked
statusCode int // HTTP status, or the 999 sentinel on transport failure
}
// getHypertextReference returns the value of the first "href" attribute on
// tag, or the empty string when the tag has no href attribute.
func getHypertextReference(tag html.Token) (href string) {
	for _, attribute := range tag.Attr {
		if attribute.Key != "href" {
			continue
		}
		return attribute.Val
	}
	return ""
}
// extractLinksFromPage GETs address, tokenizes the returned HTML, and sends
// every <a href> whose URL is absolute http/https on c. The channel is
// closed on every exit path so a ranging consumer can never block forever.
func extractLinksFromPage(address string, c chan<- string) {
	// BUG FIX: the original only closed c on a well-formed </html> end
	// tag, so a fetch error, a malformed document, or a truncated page
	// deadlocked the consumer's range loop — and any <a> token appearing
	// after </html> would send on a closed channel and panic.
	defer close(c)

	response, err := http.Get(address)
	if err != nil {
		fmt.Printf("An error occurred while issuing a HTTP GET request to %s\n", address)
		return
	}
	defer response.Body.Close()

	tokenizer := html.NewTokenizer(response.Body)
	for {
		tokenType := tokenizer.Next()
		// ErrorToken covers both end-of-document (io.EOF) and real errors.
		if tokenType == html.ErrorToken {
			return
		}
		if tokenType != html.StartTagToken {
			continue
		}
		token := tokenizer.Token()
		if token.Data != "a" {
			continue
		}
		href := getHypertextReference(token)
		location, err := url.Parse(href)
		if err != nil {
			// BUG FIX: a single malformed href used to panic the whole
			// program; skip the bad link instead.
			fmt.Printf("skipping malformed href %q: %v\n", href, err)
			continue
		}
		if location.Scheme == "http" || location.Scheme == "https" {
			c <- href
		}
	}
}
// checkLink GETs href and reports the resulting status code on responses.
// Exactly one HttpResponse is sent per call. A transport-level failure
// (DNS, refused connection, timeout, ...) is reported with the sentinel
// pseudo-status 999, preserving the behavior the consumer already prints.
func checkLink(href string, responses chan<- HttpResponse) {
	response, err := http.Get(href)
	if err != nil {
		// 999 is a made-up "could not fetch" marker: the struct has no
		// error field, so the status code is the only signal available.
		responses <- HttpResponse{href, 999}
		return
	}
	defer response.Body.Close()
	// BUG FIX: the original if/else sent the identical value on both the
	// 404 and non-404 branches; a single unconditional send is equivalent.
	responses <- HttpResponse{href, response.StatusCode}
}
// main reads a start URL from the command line, streams every absolute
// link found on that page, checks each one concurrently, and prints one
// status line per link.
func main() {
	// BUG FIX: the original indexed os.Args[1] unconditionally and
	// panicked with an index-out-of-range when run without an argument.
	if len(os.Args) < 2 {
		fmt.Println("usage: linkcheck <url>")
		os.Exit(1)
	}

	hrefs := make(chan string)
	httpResponses := make(chan HttpResponse)

	// The extractor closes hrefs when done, terminating the range below.
	go extractLinksFromPage(os.Args[1], hrefs)

	// Count links as they arrive so we know exactly how many responses to
	// wait for: checkLink sends exactly one HttpResponse per href.
	numberOfLinks := 0
	for href := range hrefs {
		numberOfLinks++
		go checkLink(href, httpResponses)
	}
	for i := 0; i < numberOfLinks; i++ {
		response := <-httpResponses
		fmt.Printf("Status %d for URL %s\n", response.statusCode, response.url)
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment