Skip to content

Instantly share code, notes, and snippets.

@vanleantking
Forked from xeoncross/html_tokens.go
Created October 23, 2018 02:18
Show Gist options
  • Save vanleantking/cd157acc3c293da729b68376ba8cc95d to your computer and use it in GitHub Desktop.
Save vanleantking/cd157acc3c293da729b68376ba8cc95d to your computer and use it in GitHub Desktop.
A simple HTML doc parser in golang that sends the tokens we are looking for back to the caller over a channel.
package main
import (
"fmt"
"strings"
"golang.org/x/net/html"
)
func main() {
HTMLString := `<!DOCTYPE html>
<html itemscope itemtype="http://schema.org/QAPage">
<head>
<title>go - Golang parse HTML, extract all content with &lt;body&gt; &lt;/body&gt; tags - Stack Overflow</title>
<link rel="shortcut icon" href="//cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d">
<link rel="apple-touch-icon image_src" href="//cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon.png?v=c78bd457575a">
<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
<meta name="twitter:card" content="summary">
<meta name="twitter:domain" content="stackoverflow.com"/>
<meta property="og:type" content="website" />
</head>
<body class="template-blog">
<nav class="navigation">
<div class="navigation__container container">
<a class="navigation__logo" href="/">
<h1>Foobar</h1>
</a>
<ul class="navigation__menu">
<li><a href="/tags/">Topics</a></li>
<li><a href="/about">About</a></li>
</ul>
</div>`
var c chan Node
var title string
var a []string
wantedTokens := []string{
"a", "title",
}
c = GetTokensFromHTMLString(HTMLString, wantedTokens)
for node := range c {
// fmt.Println(node.Type, node)
if node.Type == "title" {
tt := node.Doc.Next()
if tt == html.TextToken {
next := node.Doc.Token()
title = strings.TrimSpace(next.Data)
}
}
if node.Type == "a" {
a = append(a, node.Type)
}
}
fmt.Println("title", title)
fmt.Println("a", a)
}
// Node is the value sent over the channel for every start tag that matched
// one of the wanted names. The field order matters: the producer builds Node
// with a positional (unkeyed) literal.
type Node struct {
// Type is the tag name of the matched start tag (the token's Data).
Type string
// Token is the token obtained from the tokenizer for that start tag.
Token html.Token
// Doc is the tokenizer that produced Token. It is the very instance the
// producing goroutine continues to call Next on, not a private copy.
Doc *html.Tokenizer
}
// GetTokensFromHTMLString tokenizes HTMLString on a background goroutine and
// sends a Node on the returned channel for every start tag whose name appears
// in wantedTokens, in document order. Each matching tag is reported exactly
// once, even if wantedTokens lists the same name more than once.
//
// The channel is unbuffered and is closed when the tokenizer returns
// html.ErrorToken, i.e. at the end of the input or on a tokenizer error.
// Because every send blocks until it is received, the caller must keep
// receiving until the channel is closed; otherwise the goroutine never exits.
//
// Every Node carries the tokenizer itself in Node.Doc. That tokenizer is
// shared with the producer, which resumes calling Next on it as soon as the
// Node has been received. A consumer that advances Node.Doc therefore does so
// concurrently with the producer.
func GetTokensFromHTMLString(HTMLString string, wantedTokens []string) (c chan Node) {
	c = make(chan Node)

	go func() {
		defer close(c)

		// https://play.golang.org/p/0MRSefJ_-E
		z := html.NewTokenizer(strings.NewReader(HTMLString))

		for {
			switch z.Next() {
			case html.ErrorToken:
				// End of the document (or a tokenizer error): we're done.
				return
			case html.StartTagToken:
				token := z.Token()
				for _, name := range wantedTokens {
					if token.Data == name {
						c <- Node{Type: token.Data, Token: token, Doc: z}
						// Stop at the first match so a repeated name in
						// wantedTokens cannot emit the same tag twice.
						break
					}
				}
			}
		}
	}()

	return c
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment