Skip to content

Instantly share code, notes, and snippets.

@beeekind
Last active December 2, 2020 19:35
Show Gist options
  • Save beeekind/a7c5ac1303626af3d70c4106e9c7dd08 to your computer and use it in GitHub Desktop.
Save beeekind/a7c5ac1303626af3d70c4106e9c7dd08 to your computer and use it in GitHub Desktop.
Simple, intuitive way to scrape single-page applications in Go
package main
import (
	"context"
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/chromedp/cdproto/dom"
	"github.com/chromedp/chromedp"
)
// main demonstrates ParseWebApp: it renders a page in headless Chrome and
// prints the text of the first <h1> element found in the resulting DOM.
func main() {
	// The URL is passed straight to chromedp.Navigate, so it is written as an
	// absolute URL with its scheme; a bare host name such as "youtube.com" is
	// expected to be rejected by Chrome as an invalid URL.
	doc, err := ParseWebApp("https://youtube.com")
	if err != nil {
		// ParseWebApp returns a nil document on failure, so stop here rather
		// than dereferencing it below.
		fmt.Println("error:", err)
		return
	}
	fmt.Println(doc.Find("h1").First().Text())
}
// ParseWebApp loads url in a Chrome instance driven by chromedp, waits for the
// document root to be ready, and returns the rendered DOM parsed into a
// goquery document.
//
// url is handed to chromedp.Navigate unchanged, so it should be an absolute
// URL including its scheme (for example "https://example.com"). An empty url
// is rejected up front, before any chromedp call is made.
//
// The chromedp context is derived from context.Background() and therefore
// carries no deadline: the call blocks until the page is ready or chromedp
// reports an error. The context is cancelled when the function returns.
//
// On failure the returned document is nil and the underlying error is wrapped
// with %w, so callers can inspect it with errors.Is / errors.As.
func ParseWebApp(url string) (*goquery.Document, error) {
	if url == "" {
		return nil, fmt.Errorf("ParseWebApp(): empty url")
	}

	var outerHTML string

	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	err := chromedp.Run(ctx,
		chromedp.Navigate(url),
		// JS rendering happens asynchronously; waiting for the root element
		// to be ready seems to be enough to account for that.
		chromedp.WaitReady(":root"),
		// Serialize the whole rendered document so goquery can parse it.
		chromedp.ActionFunc(func(ctx context.Context) error {
			root, err := dom.GetDocument().Do(ctx)
			if err != nil {
				return err
			}
			outerHTML, err = dom.GetOuterHTML().WithNodeID(root.NodeID).Do(ctx)
			return err
		}),
	)
	if err != nil {
		// The failure may come from navigation, the wait, or the DOM dump,
		// so label it with the whole run (and the URL) rather than one step.
		return nil, fmt.Errorf("ParseWebApp(): chromedp.Run(%q): %w", url, err)
	}

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(outerHTML))
	if err != nil {
		return nil, fmt.Errorf("ParseWebApp(): goquery.NewDocumentFromReader(): %w", err)
	}
	return doc, nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment