Created August 24, 2018 13:38
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"os"
	"time"

	"gopkg.in/cheggaaa/pb.v1"
)
// Node - every link found is stored as a Node.
type Node struct {
	link        string
	redirectURL string
	statusCode  int
	headers     http.Header
}
// readLines reads a file of URLs, one per line, and drops duplicate
// entries while preserving order.
func readLines(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}

	// Deduplicate while keeping the first occurrence of each URL.
	encountered := map[string]bool{}
	result := []string{}
	for _, line := range lines {
		if encountered[line] {
			continue // skip duplicate
		}
		encountered[line] = true
		result = append(result, line)
	}
	return result, scanner.Err()
}
// urlCheck performs a GET request against a single URL and returns a Node
// describing the response.
func urlCheck(url string) (Node, error) {
	var data Node
	client := http.Client{
		Timeout: 30 * time.Second,
	}
	resp, err := client.Get(url)
	if err != nil {
		// Return early so we never dereference a nil response.
		log.Print(err)
		return data, err
	}
	defer resp.Body.Close()

	data.link = url
	data.statusCode = resp.StatusCode
	data.headers = resp.Header
	// resp.Request.URL holds the final URL after any redirects were followed.
	if resp.Request.URL.String() != url {
		data.redirectURL = resp.Request.URL.String()
	}
	return data, nil
}
// runChecker checks every URL in the slice and returns a Node for each one
// that was fetched successfully.
func runChecker(l []string) []Node {
	checked := []Node{}
	fmt.Println("Checking URLs")
	bar := pb.StartNew(len(l))
	// Crawl each URL in the input file consecutively.
	for i := 0; i < len(l); i++ {
		bar.Increment()
		n, e := urlCheck(l[i])
		if e != nil {
			fmt.Printf("Got error: %s when fetching: %s\n", e, l[i])
		} else {
			checked = append(checked, n)
		}
	}
	bar.Finish()
	return checked
}
func main() {
	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "Usage: %s <file path>\n", os.Args[0])
		os.Exit(1)
	}
	links, err := readLines(os.Args[1])
	if err != nil {
		fmt.Println("Error reading file:", err)
		os.Exit(1)
	}
	checked := runChecker(links)
	fmt.Println("Reported 404 URLs")
	for _, link := range checked {
		if link.statusCode == 404 {
			fmt.Printf("URL: %s, reported: %d\n", link.link, link.statusCode)
			for k, v := range link.headers {
				fmt.Printf("Header field %s and value %s\n", k, v)
			}
		}
	}
}
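For reference, a minimal way to run the checker, assuming the gist is saved as main.go under a GOPATH-style setup (circa 2018) and the progress-bar dependency is fetched first; the urls.txt name is only an example:

go get gopkg.in/cheggaaa/pb.v1
go run main.go urls.txt

urls.txt holds one URL per line, for example:

https://example.com
https://example.com/does-not-exist

Any URL that answers with a 404 is printed at the end along with its response headers.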