|
package main |
|
|
|
import ( |
|
"context" |
|
"fmt" |
|
"log" |
|
"strings" |
|
"sync" |
|
|
|
"github.com/PuerkitoBio/goquery" |
|
"github.com/chromedp/chromedp" |
|
) |
|
|
|
// Addresses of the NLRB case search used by the scraper.
const (
	// url is the base address of the case search page.
	url = "https://www.nlrb.gov/search/case"

	// searchUrlTemplate is a fmt format string for one page of search
	// results; its single %d verb takes the zero-based page number.
	searchUrlTemplate = url + "?page=%d"
)
|
|
|
// Record is one case scraped from a page of search results.
//
// The field order is significant: Worker.Process builds Records with an
// unkeyed composite literal, so fields must stay in this order.
type Record struct {
	// Defendant is the text of the case's title link.
	Defendant string
	// CaseNumber is the text of the first link in the case's main content.
	CaseNumber string
	// CaseUrl is the absolute URL of the case page, or "Missing" when the
	// title link carries no href.
	CaseUrl string
}
|
|
|
func main() { |
|
|
|
// Page number to collect for |
|
jobChan := make(chan int, 1) |
|
// Where to send records to be handled |
|
recordChan := make(chan Record, 1) |
|
writeDoneChan := make(chan bool, 1) |
|
|
|
var wg sync.WaitGroup |
|
|
|
for w_id := 0; w_id < 3; w_id++ { |
|
wg.Add(1) |
|
w := &Worker{w_id, jobChan, recordChan, &wg} |
|
go w.Work() |
|
} |
|
go recordHandler(recordChan, writeDoneChan) |
|
|
|
// ?page=0 to ?page=43803 |
|
// Keeping it short while testing. Don't wanna slam their server. |
|
for page := 0; page < 7; page++ { |
|
jobChan <- page |
|
} |
|
|
|
close(jobChan) |
|
// Wait for workers to finish |
|
log.Println("Main: Waiting for workers to signal wait group done") |
|
wg.Wait() |
|
log.Println("Main: Closing record channel") |
|
close(recordChan) |
|
log.Println("Main: Waiting for writes to complete") |
|
<-writeDoneChan |
|
|
|
} |
|
|
|
// Processes records received via recordChan, eventually signals writes complete via writeDoneChan |
|
func recordHandler(recordChan chan Record, writeDoneChan chan bool) { |
|
defer func() { |
|
log.Println("RecordHandler: signaling write complete") |
|
writeDoneChan <- true |
|
}() |
|
for r := range recordChan { |
|
//TODO use a CSV writer or DB write here instead |
|
fmt.Printf("\tDefendant: %s URL: %s Case#: %s\n", r.Defendant, r.CaseUrl, r.CaseNumber) |
|
} |
|
} |
|
|
|
type Worker struct { |
|
id int |
|
jobChan chan int |
|
recordChan chan Record |
|
wg *sync.WaitGroup |
|
} |
|
|
|
// Convenience function to shorten calls like log.Printf("Worker %d: ...", i, args...) |
|
func (w Worker) Log(fmtString string, args ...interface{}) { |
|
fmtString = fmt.Sprintf("Worker %d: ", w.id) + fmtString |
|
if len(args) == 0 { |
|
log.Println(fmtString) |
|
} else { |
|
log.Printf(fmtString+"\n", args...) |
|
} |
|
} |
|
|
|
// Work creates consumes jobs from a channel and processes each |
|
func (w Worker) Work() { |
|
defer w.wg.Done() |
|
|
|
// I think sharing one context across workers causes sync issues, |
|
// so giving each worker its own context to re-use in serial |
|
ctx, cancel := chromedp.NewContext(context.Background()) |
|
defer cancel() |
|
|
|
for page := range w.jobChan { |
|
w.Log("Processing page %d", page) |
|
w.Process(ctx, page) |
|
} |
|
w.Log("Complete") |
|
} |
|
|
|
// Process a single job from queue |
|
func (w Worker) Process(ctx context.Context, page int) { |
|
w.Log("Chromedp fetching page %d", page) |
|
searchUrl := fmt.Sprintf(searchUrlTemplate, page) |
|
|
|
var data string |
|
|
|
if err := chromedp.Run(ctx, |
|
chromedp.Navigate(searchUrl), |
|
// Wait til list elements have been populated by JS |
|
chromedp.WaitVisible(`.wrapper-div`), |
|
// Grab the whole list |
|
chromedp.OuterHTML(".case-search-results", &data, chromedp.ByQuery), |
|
); err != nil { |
|
w.Log("Chromedp failed to fetch page %d: %s", page, err) |
|
return |
|
} |
|
|
|
w.Log("Goquery parsing page %d", page) |
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) |
|
if err != nil { |
|
w.Log("Goquery failed to parse page %d: %s", page, err) |
|
return |
|
} |
|
doc.Find(".case-search-results .wrapper-div").Each(func(i int, s *goquery.Selection) { |
|
// Each case is just in a div element. For each such .wrapper-div : |
|
title := s.Find(".grid-row h2 > a") |
|
defendant := title.Text() |
|
caseUrl, exists := title.Attr("href") |
|
if !exists { |
|
caseUrl = "Missing" |
|
w.Log("Page %d: failed to find case url for defendant %d", page, defendant) |
|
} else { |
|
caseUrl = "https://www.nlrb.gov" + caseUrl |
|
} |
|
// Could fail if the a isn't found, in which case First() returns an empty Selection |
|
caseNo := s.Find(".wrapper-main-content a").First().Text() |
|
w.recordChan <- Record{defendant, caseNo, caseUrl} |
|
}) |
|
} |