Skip to content

Instantly share code, notes, and snippets.

@pressure679
Created April 13, 2019 22:48
Show Gist options
  • Save pressure679/ef0c18451218fe02899549748bed5899 to your computer and use it in GitHub Desktop.
Save pressure679/ef0c18451218fe02899549748bed5899 to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"strings"
"bytes"
"golang.org/x/net/html"
"github.com/akhenakh/gozim"
"github.com/jaytaylor/html2text"
)
// main opens a ZIM archive, loads a single article from it, and prints three
// views of that article to stdout:
//
//  1. the raw article data as stored in the archive,
//  2. a plain-text rendering produced by html2text, and
//  3. the href value of every <a> start tag found in the data.
//
// Any failure while opening the archive, fetching the article, reading its
// data, or converting it to text aborts the program with a panic.
func main() {
	zimPath := "/home/pressure679/Documents/Wiki/Books/wikibooks_en_all_nopic_2019-03.zim"

	z, err := zim.NewReader(zimPath, false)
	// The reader must be validated before use: without this check the error
	// would be overwritten by the ArticleAt call below and z could be unusable.
	if err != nil {
		panic(err)
	}

	article, err := z.ArticleAt(6985040)
	if err != nil {
		panic(err)
	}

	data, err := article.Data()
	if err != nil {
		panic(err)
	}
	fmt.Println(string(data))

	text, err := html2text.FromString(string(data), html2text.Options{PrettyTables: false})
	if err != nil {
		panic(err)
	}
	fmt.Println(text)

	// Walk the token stream and report the first href attribute of each
	// anchor start tag.
	htmlTokenizer := html.NewTokenizer(bytes.NewReader(data))
	for {
		switch htmlTokenizer.Next() {
		case html.ErrorToken:
			// The tokenizer yields ErrorToken when it cannot produce any more
			// tokens (including at end of input), so scanning stops here.
			return
		case html.StartTagToken:
			t := htmlTokenizer.Token()
			if !strings.EqualFold(t.Data, "a") {
				continue
			}
			for _, a := range t.Attr {
				if a.Key == "href" {
					fmt.Println("Found href:", a.Val)
					// Only the first href of a tag is reported.
					break
				}
			}
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment