Skip to content

Instantly share code, notes, and snippets.

@ericchiang
Created January 12, 2016 17:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericchiang/4cbefb674c9a4c0b33c2 to your computer and use it in GitHub Desktop.
Save ericchiang/4cbefb674c9a4c0b33c2 to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"io"
"log"
"net/http"
"os"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
// main fetches the Wikipedia front page, locates its <title> element and
// writes that element, rendered as HTML, to standard output followed by a
// newline. Any failure is logged and ends the program via log.Fatal.
func main() {
	root, err := getAndParse("https://www.wikipedia.org/", "")
	if err != nil {
		log.Fatal(err)
	}

	title, ok := getTitle(root)
	if !ok {
		log.Fatal("could not find title")
	}

	// html.Render returns an error when writing fails; report it instead of
	// silently producing truncated output.
	if err := html.Render(os.Stdout, title); err != nil {
		log.Fatalf("rendering title: %v", err)
	}
	fmt.Println()
}
// getAndParse makes a GET request to the provided URL and attempts
// to parse the response body as HTML encoded as the given charset. It
// returns the root node of the parsed page.
//
// If pageCharset is an empty string the charset is guessed from the
// response's Content-Type header and the leading bytes of the body.
// Otherwise pageCharset is looked up as an HTML encoding label and the
// body is decoded from that encoding to UTF-8 before parsing.
//
// An error is returned if the request fails, the response status is not
// 2xx, the charset cannot be resolved, or the HTML cannot be parsed.
func getAndParse(url, pageCharset string) (*html.Node, error) {
	// Make a GET request to the provided URL.
	// NOTE: http.Get uses http.DefaultClient, which has no request timeout.
	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("GET: %w", err)
	}
	defer resp.Body.Close()

	// Without this check an error page (404, 500, ...) would be parsed and
	// returned as if it were the requested document.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// html.Parse assumes content to be UTF-8 encoded.
	var reader io.Reader
	if pageCharset == "" {
		// Attempt to guess the charset of the HTML document. Passing the
		// declared Content-Type lets the detector honour a charset
		// parameter sent by the server in addition to sniffing the body.
		reader, err = charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
		if err != nil {
			return nil, fmt.Errorf("creating new charset reader: %w", err)
		}
	} else {
		// Look up the charset and transform the body from it to UTF-8.
		e, name := charset.Lookup(pageCharset)
		if e == nil || name == "" {
			return nil, fmt.Errorf("provided charset %q not found", pageCharset)
		}
		reader = transform.NewReader(resp.Body, e.NewDecoder())
	}

	// Use the html package to parse the page.
	root, err := html.Parse(reader)
	if err != nil {
		return nil, fmt.Errorf("parsing HTML: %w", err)
	}
	return root, nil
}
// getTitle recursively looks up the <title> element.
//
// It walks the tree rooted at node depth first, visiting each node before
// its children and children in sibling order, and returns the first node
// whose DataAtom is atom.Title together with true. If no such node exists
// it returns nil and false. A nil node is treated as an empty tree.
func getTitle(node *html.Node) (*html.Node, bool) {
	// Guard against a nil root so callers get "not found" instead of a panic.
	if node == nil {
		return nil, false
	}
	if node.DataAtom == atom.Title {
		return node, true
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if title, ok := getTitle(c); ok {
			return title, true
		}
	}
	return nil, false
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment