Skip to content

Instantly share code, notes, and snippets.

@seanhagen
Created September 1, 2022 23:53
Show Gist options
  • Save seanhagen/5b4ae6ba755db52ab79640d088f75dce to your computer and use it in GitHub Desktop.
Save seanhagen/5b4ae6ba755db52ab79640d088f75dce to your computer and use it in GitHub Desktop.
Simple web scraper to get a list of English words
package main
import (
	"fmt"
	"io"
	"net/http"
	"os"
	"time"

	"github.com/PuerkitoBio/goquery"
)
const (
	// urlBase is the fmt template for a word-list page; its single %d verb
	// is filled with the page number.
	urlBase = "https://en.wikwik.org/allwordspage%d.htm"

	// fileName is the path of the output file the scraped words are written to.
	fileName = "allwords.txt"
)

// startPage is the page number the scrape begins at; main counts down from
// here to page 1.
var startPage = 2066
// main scrapes every word-list page from startPage down to page 1 and writes
// the words it finds to fileName, which is created (or truncated) first.
//
// A page that fails to scrape is reported and skipped so that one bad page
// does not abort the whole run. The process exits with status 1 if the output
// file cannot be opened or closed, or if any page failed, so that callers can
// tell the resulting word list is incomplete.
func main() {
	file, err := os.OpenFile(fileName, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		fmt.Printf("unable to open %s: %v\n", fileName, err)
		os.Exit(1)
	}

	fmt.Printf("starting page scrape...\n")
	failed := 0
	for i := startPage; i >= 1; i-- {
		url := fmt.Sprintf(urlBase, i)
		fmt.Printf("page: %v - %s\n", i, url)
		if err := scrapePage(file, url); err != nil {
			// Best effort: note the failure and move on to the next page.
			fmt.Printf("unable to scrape '%v': %v\n", url, err)
			failed++
		}
	}

	fmt.Printf("finished scraping, closing file...\n")
	// Close explicitly (not via defer) so a failure here is observed before
	// the exit status is decided; os.Exit would skip deferred calls anyway.
	closeErr := file.Close()
	if closeErr != nil {
		fmt.Printf("unable to close output file: %v\n", closeErr)
	}
	if failed > 0 {
		fmt.Printf("%d page(s) could not be scraped\n", failed)
	}
	if closeErr != nil || failed > 0 {
		os.Exit(1)
	}
	fmt.Printf("done!\n\n")
}
// scrapeClient is shared by every page fetch so connections can be reused.
// It carries a timeout so a stalled server cannot hang the scraper forever;
// http.Get uses http.DefaultClient, which has no timeout.
var scrapeClient = &http.Client{Timeout: 30 * time.Second}

// scrapePage downloads the page at url and writes the text of every element
// matching the selector "p.mm > a" to wr, one entry per line.
//
// It returns an error if the page cannot be fetched, the response status is
// not 200 OK, the body cannot be parsed by goquery, or a write to wr fails.
// On a write failure it stops at the first failing item; lines already
// written to wr are left in place.
func scrapePage(wr io.Writer, url string) error {
	resp, err := scrapeClient.Get(url)
	if err != nil {
		return fmt.Errorf("unable to fetch page '%v', got error: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("got status code %v instead of 200", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return fmt.Errorf("unable to parse document using goquery: %w", err)
	}

	// Declared as the error interface type so that a run with no failures
	// returns a true nil.
	var writeErr error
	doc.Find("p.mm > a").EachWithBreak(func(i int, s *goquery.Selection) bool {
		word := s.Text()
		if _, err := fmt.Fprintf(wr, "%v\n", word); err != nil {
			writeErr = fmt.Errorf("unable to write '%s' (item '%v' from url '%v') to output: %w", word, i, url, err)
			return false // stop iterating at the first failed write
		}
		return true
	})
	return writeErr
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment