Skip to content

Instantly share code, notes, and snippets.

@b5
Created September 27, 2017 22:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save b5/99e48d6e4e8d7f37159da17df90f08ae to your computer and use it in GitHub Desktop.
package main
import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"net/url"
	"os"

	"github.com/PuerkitoBio/purell"
)
// Command line flag values, registered in init and populated by flag.Parse.
var (
	// showHelp triggers printing usage text instead of running
	showHelp bool
	// outFile is the path to write results to; "" means write to stderr
	outFile string
	// inFile is the path of the line-delimited url list to read
	inFile string
)
func init() {
flag.BoolVar(&showHelp, "h", false, "print help text")
flag.StringVar(&inFile, "f", "", "file to read from")
flag.StringVar(&outFile, "o", "", "path to write file to")
}
// main wires together the input scanner, results writer, and url
// normalizer, printing any error to stdout and returning.
func main() {
	// parse flags, grabbing values from the command line
	flag.Parse()

	// no arguments at all, or an explicit -h, means print usage and quit
	if len(os.Args) == 1 || showHelp {
		PrintHelpText()
		return
	}

	// open the input file as a line-by-line scanner
	s, err := NewFileScanner(inFile)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	// allocate a new results writer
	w, err := NewResultsWriter(outFile)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	stats, err := NormalizeUrls(s, w)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	// if stderr isn't being used for output (i.e. results went to a file),
	// write stats to stdout. NOTE(review): the original comment said
	// "stdout", but NewResultsWriter's default writer is os.Stderr,
	// which is what this compares against.
	if w != os.Stderr {
		fmt.Println(stats)
	}

	// check to see if our writer implements the closer interface,
	// call close if so
	if closer, ok := w.(io.Closer); ok {
		if err := closer.Close(); err != nil {
			fmt.Println(err.Error())
			return
		}
	}
}
// Stats counts what happened during a normalization run.
type Stats struct {
	// total urls read from the input
	Urls int
	// urls whose normalized form was already seen
	Duplicates int
	// urls actually written to the output
	Added int
}

// String renders the stats as a short human-readable summary,
// satisfying the fmt.Stringer interface.
func (s *Stats) String() string {
	summary := fmt.Sprintf("%d Urls scanned.\n", s.Urls)
	summary += fmt.Sprintf("%d were duplicates.\n", s.Duplicates)
	summary += fmt.Sprintf("%d urls written.", s.Added)
	return summary
}
// NewFileScanner scans lines from a file
func NewFileScanner(path string) (*bufio.Scanner, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
return bufio.NewScanner(f), nil
}
// NewResultsWriter writes to either a file or stderr if no path is provided
func NewResultsWriter(path string) (io.Writer, error) {
if path != "" {
return os.Create(path)
}
return os.Stderr, nil
}
// NormalizeUrls reads one url per line from s, normalizes each with
// purell's "usually safe greedy" rule set, and writes a line-delimited,
// de-duplicated list of the normalized urls to w.
// It returns counts of urls scanned, duplicates skipped, and urls
// written, plus any write or scan error encountered.
func NormalizeUrls(s *bufio.Scanner, w io.Writer) (*Stats, error) {
	// stats accumulates totals as we scan
	stats := &Stats{}
	// added records normalized urls we've already written, for dedup
	added := map[string]bool{}

	for s.Scan() {
		stats.Urls++

		u, err := url.Parse(s.Text())
		if err != nil {
			// skip malformed lines rather than aborting the whole run;
			// previously the error was ignored and a nil-unsafe *url.URL
			// could reach purell
			continue
		}

		urlstr := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy)
		if added[urlstr] {
			stats.Duplicates++
			continue
		}
		added[urlstr] = true
		stats.Added++

		// write the url as a line to the writer, surfacing write failures
		if _, err := fmt.Fprintln(w, urlstr); err != nil {
			return stats, err
		}
	}

	// a scanner that stopped due to a read error (not EOF) reports it here
	if err := s.Err(); err != nil {
		return stats, err
	}
	return stats, nil
}
// PrintHelpText outputs instructions for using this program to os.Stdout.
// The previous text was copy-pasted from an href-extraction tool and
// described jquery selectors and HTML documents, which this tool does
// not use.
func PrintHelpText() {
	fmt.Println(`
normalize is a command line tool for normalizing a list of urls,
writing each url on a new line.
Each url written is:
  * normalized - (cleaned up using purell's usually-safe rules)
  * unique - (no duplicates are added to the list)
options:`)
	flag.PrintDefaults()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment