Created
September 1, 2022 23:53
-
-
Save seanhagen/5b4ae6ba755db52ab79640d088f75dce to your computer and use it in GitHub Desktop.
Simple web scraper to get a list of English words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io" | |
"net/http" | |
"os" | |
"github.com/PuerkitoBio/goquery" | |
) | |
// Scraper configuration.
const (
	// urlBase is the fmt template for a word-list page URL; its single %d verb
	// receives the page number.
	urlBase = "https://en.wikwik.org/allwordspage%d.htm"

	// fileName is the path of the output file that main creates (or truncates)
	// and that receives one scraped entry per line.
	fileName = "allwords.txt"
)

// startPage is the page number main starts from; main then counts down to
// page 1.
var startPage = 2066
func main() { | |
file, err := os.OpenFile(fileName, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) | |
if err != nil { | |
fmt.Printf("unable to open %s: %v\n", fileName, err) | |
os.Exit(1) | |
} | |
fmt.Printf("starting page scrape...\n") | |
for i := startPage; i >= 1; i-- { | |
url := fmt.Sprintf(urlBase, i) | |
fmt.Printf("page: %v - %s\n", i, url) | |
if err := scrapePage(file, url); err != nil { | |
fmt.Printf("unable to scrape '%v': %v\n", url, err) | |
continue | |
} | |
} | |
fmt.Printf("finished scraping, closing file...\n") | |
if err := file.Close(); err != nil { | |
fmt.Printf("unable to close output file: %v\n", err) | |
} | |
fmt.Printf("done!\n\n") | |
} | |
func scrapePage(wr io.Writer, url string) error { | |
resp, err := http.Get(url) | |
if err != nil { | |
return fmt.Errorf("unable to fetch page '%v', got error: %w", url, err) | |
} | |
defer resp.Body.Close() | |
if resp.StatusCode != http.StatusOK { | |
return fmt.Errorf("got status code %v instead of 200", resp.StatusCode) | |
} | |
doc, err := goquery.NewDocumentFromReader(resp.Body) | |
if err != nil { | |
return fmt.Errorf("unable to parse document using goquery: %w", err) | |
} | |
doc.Find("p.mm > a").Each(func(i int, s *goquery.Selection) { | |
if _, err := fmt.Fprintf(wr, "%v\n", s.Text()); err != nil { | |
fmt.Printf("unable to write '%s' (item '%v' from url '%v') to output: %v\n", s.Text(), i, url, err) | |
} | |
}) | |
return nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment