Created
September 27, 2017 22:28
-
-
Save b5/99e48d6e4e8d7f37159da17df90f08ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"flag" | |
"fmt" | |
"github.com/PuerkitoBio/purell" | |
"io" | |
"net/url" | |
"os" | |
) | |
// Command line flag values, registered in init and populated by flag.Parse.
var (
	// showHelp causes usage text to be printed and the program to exit.
	showHelp bool
	// outFile is the path to write results to; "" means write to os.Stderr.
	outFile string
	// inFile is the path of the line-delimited url list to read.
	inFile string
)
// init registers the command line flags before main calls flag.Parse.
func init() {
	flag.BoolVar(&showHelp, "h", false, "print help text")
	flag.StringVar(&inFile, "f", "", "file to read from")
	flag.StringVar(&outFile, "o", "", "path to write file to")
}
func main() { | |
// parse flags, grabbing values from the command line | |
flag.Parse() | |
if len(os.Args) == 1 || showHelp { | |
PrintHelpText() | |
return | |
} | |
s, err := NewFileScanner(inFile) | |
if err != nil { | |
fmt.Println(err.Error()) | |
return | |
} | |
// allocate a new results writer | |
w, err := NewResultsWriter(outFile) | |
if err != nil { | |
fmt.Println(err.Error()) | |
return | |
} | |
stats, err := NormalizeUrls(s, w) | |
if err != nil { | |
fmt.Println(err.Error()) | |
return | |
} | |
// if stdout isn't being used for output, write stats to stdout | |
if w != os.Stderr { | |
fmt.Println(stats) | |
} | |
// check to see if our writer implements the closer interface, | |
// call close if so | |
if closer, ok := w.(io.Closer); ok { | |
if err := closer.Close(); err != nil { | |
fmt.Println(err.Error()) | |
return | |
} | |
} | |
} | |
// Stats accumulates counters for a single url-normalization run.
type Stats struct {
	// total urls read from the input
	Urls int
	// urls whose normalized form had already been seen
	Duplicates int
	// unique urls written to the output
	Added int
}

// String implements fmt.Stringer, summarizing the run as three lines.
func (s *Stats) String() string {
	summary := fmt.Sprintf(
		"%d Urls scanned.\n%d were duplicates.\n%d urls written.",
		s.Urls, s.Duplicates, s.Added,
	)
	return summary
}
// NewFileScanner scans lines from a file | |
func NewFileScanner(path string) (*bufio.Scanner, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, err | |
} | |
return bufio.NewScanner(f), nil | |
} | |
// NewResultsWriter writes to either a file or stderr if no path is provided | |
func NewResultsWriter(path string) (io.Writer, error) { | |
if path != "" { | |
return os.Create(path) | |
} | |
return os.Stderr, nil | |
} | |
// NormalizeUrls fetches a given url, and uses the provided jquery-style selector to grab | |
// all of the "href" attributes for a given url HTML document, writing a line-delimited list of | |
// deduplicated absolute urls to w | |
func NormalizeUrls(s *bufio.Scanner, w io.Writer) (*Stats, error) { | |
// create a stats object with the total number of matched element | |
stats := &Stats{} | |
// added is a list of urls that have been added already | |
added := map[string]bool{} | |
// iterate through elements | |
for s.Scan() { | |
stats.Urls++ | |
u, _ := url.Parse(s.Text()) | |
urlstr := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy) | |
if added[urlstr] == false { | |
added[urlstr] = true | |
stats.Added++ | |
// write the url as a line to the writer | |
w.Write([]byte(fmt.Sprintf("%s\n", urlstr))) | |
} else { | |
stats.Duplicates++ | |
} | |
} | |
return stats, nil | |
} | |
// PrintHelpText outputs instructions for using this program to os.Stdout.
//
// The previous text described a different tool ("extract_href", jquery-style
// selectors, href attributes); this program only normalizes a list of urls.
func PrintHelpText() {
	fmt.Println(`
normalize is a command line tool for normalizing a list of urls,
writing each url on a new line.

Each url written is:
	* normalized - (using purell's "usually safe greedy" rules)
	* unique - (no duplicates are added to the list)

options:`)
	flag.PrintDefaults()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment