Skip to content

Instantly share code, notes, and snippets.

@eenblam
Last active March 31, 2023 21:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save eenblam/1e679bc9863eb7eecbd653780b803a3e to your computer and use it in GitHub Desktop.
Quick and dirty NLRB case scraper

As discussed at https://eenblam.github.io/recurse_004

Proof of concept. Some concerns:

  • Should be more polite and set a user-agent with contact info
  • Definitely need to handle job failure and site unavailability:
    • Push failed jobs onto a re-do queue
    • Enable backoff/slowdown if enough jobs are failing
  • Still need to write out to CSV or DB when done debugging
  • Could probably get away without also using goquery? Or could try a different headless browser like https://github.com/sourcegraph/webloop
package main
import (
"context"
"fmt"
"log"
"strings"
"sync"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
)
// Landing page of the NLRB case search. NOTE(review): appears unused in
// this file — searchUrlTemplate is what the workers actually fetch.
const url = "https://www.nlrb.gov/search/case"
// Paginated search URL; %d is filled with the zero-based page number.
const searchUrlTemplate = "https://www.nlrb.gov/search/case?page=%d"
// Record holds the fields scraped from one case entry on a search
// results page.
type Record struct {
Defendant string // text of the result heading's link (the case title/defendant)
CaseNumber string // text of the first link in the entry's main content block
CaseUrl string // absolute case URL, or "Missing" if no href was found
}
// main wires up a small scraping pipeline: a pool of three workers pulls
// page numbers off a job channel, each scraped record funnels through a
// single handler goroutine, and main blocks until every record has been
// handled.
func main() {
	// Page numbers for workers to scrape.
	jobs := make(chan int, 1)
	// Scraped records, consumed by recordHandler.
	records := make(chan Record, 1)
	// Signals that recordHandler has finished all writes.
	done := make(chan bool, 1)

	var wg sync.WaitGroup
	for id := 0; id < 3; id++ {
		wg.Add(1)
		worker := &Worker{id, jobs, records, &wg}
		go worker.Work()
	}

	go recordHandler(records, done)

	// ?page=0 to ?page=43803
	// Keeping it short while testing. Don't wanna slam their server.
	page := 0
	for page < 7 {
		jobs <- page
		page++
	}
	close(jobs)

	// Workers exit once the job channel drains; wait for all of them,
	// then close the record channel so the handler's range loop ends.
	log.Println("Main: Waiting for workers to signal wait group done")
	wg.Wait()
	log.Println("Main: Closing record channel")
	close(records)
	log.Println("Main: Waiting for writes to complete")
	<-done
}
// recordHandler drains recordChan, handling each record (currently just
// printed to stdout), and signals on writeDoneChan once the channel has
// been closed and fully consumed. The signal is sent from a defer so it
// fires even if handling a record panics.
func recordHandler(recordChan chan Record, writeDoneChan chan bool) {
	defer func() {
		log.Println("RecordHandler: signaling write complete")
		writeDoneChan <- true
	}()
	for rec := range recordChan {
		//TODO use a CSV writer or DB write here instead
		fmt.Printf("\tDefendant: %s URL: %s Case#: %s\n", rec.Defendant, rec.CaseUrl, rec.CaseNumber)
	}
}
// Worker pulls page numbers from jobChan, scrapes each results page,
// and emits parsed cases on recordChan. It marks wg done when jobChan
// is exhausted.
type Worker struct {
id int // identifier used only to prefix log lines
jobChan chan int // page numbers to scrape
recordChan chan Record // parsed cases are sent here
wg *sync.WaitGroup // Done() is called once when Work returns
}
// Log writes a log line prefixed with "Worker <id>: ". When no args are
// given the message is emitted verbatim via Println, so a format string
// containing stray '%' characters is not misinterpreted.
func (w Worker) Log(fmtString string, args ...interface{}) {
	prefixed := fmt.Sprintf("Worker %d: ", w.id) + fmtString
	if len(args) > 0 {
		log.Printf(prefixed+"\n", args...)
		return
	}
	log.Println(prefixed)
}
// Work consumes jobs from the worker's job channel and processes each in
// turn, marking the wait group done when the channel is closed and drained.
func (w Worker) Work() {
	defer w.wg.Done()
	// I think sharing one context across workers causes sync issues,
	// so giving each worker its own context to re-use in serial
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	for {
		page, ok := <-w.jobChan
		if !ok {
			break
		}
		w.Log("Processing page %d", page)
		w.Process(ctx, page)
	}
	w.Log("Complete")
}
// Process scrapes a single search results page. It renders the page with
// chromedp (the results list is populated client-side by JS), grabs the
// results container's HTML, parses each case entry with goquery, and
// sends one Record per case on the worker's record channel. Failures are
// logged and the job is dropped — no retry yet (see notes at top of file).
func (w Worker) Process(ctx context.Context, page int) {
	w.Log("Chromedp fetching page %d", page)
	searchUrl := fmt.Sprintf(searchUrlTemplate, page)
	var data string
	if err := chromedp.Run(ctx,
		chromedp.Navigate(searchUrl),
		// Wait til list elements have been populated by JS
		chromedp.WaitVisible(`.wrapper-div`),
		// Grab the whole list
		chromedp.OuterHTML(".case-search-results", &data, chromedp.ByQuery),
	); err != nil {
		w.Log("Chromedp failed to fetch page %d: %s", page, err)
		return
	}
	w.Log("Goquery parsing page %d", page)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(data))
	if err != nil {
		w.Log("Goquery failed to parse page %d: %s", page, err)
		return
	}
	doc.Find(".case-search-results .wrapper-div").Each(func(i int, s *goquery.Selection) {
		// Each case is just in a div element. For each such .wrapper-div :
		title := s.Find(".grid-row h2 > a")
		defendant := title.Text()
		caseUrl, exists := title.Attr("href")
		if !exists {
			caseUrl = "Missing"
			// Fix: defendant is a string — %d here previously printed
			// a bogus "%!d(string=...)" value.
			w.Log("Page %d: failed to find case url for defendant %s", page, defendant)
		} else {
			caseUrl = "https://www.nlrb.gov" + caseUrl
		}
		// Could fail if the a isn't found, in which case First() returns an empty Selection
		caseNo := s.Find(".wrapper-main-content a").First().Text()
		// Named fields: positional literals silently misassign if the
		// Record struct is ever reordered.
		w.recordChan <- Record{Defendant: defendant, CaseNumber: caseNo, CaseUrl: caseUrl}
	})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment