Skip to content

Instantly share code, notes, and snippets.

@malisetti
Created March 15, 2019 04:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save malisetti/d76d3b7827fcc63845650f9b7e86fd5d to your computer and use it in GitHub Desktop.
Save malisetti/d76d3b7827fcc63845650f9b7e86fd5d to your computer and use it in GitHub Desktop.
Get a plain-text representation of an HTML page and perform named-entity extraction on it.
package main
import (
"fmt"
"io"
"log"
"net/http"
"os"
"sort"
"strings"
"golang.org/x/net/html"
prose "gopkg.in/jdkato/prose.v2"
)
// main fetches the page whose URL is given as the first command-line
// argument, reduces its HTML to plain text (one line per non-empty text
// node, skipping the token that follows a <script> or <style> start tag),
// logs that text, runs prose's named-entity extraction over it, and prints
// each surviving "text label" pair to stdout in sorted order.
//
// Any failure to fetch the page or to build the prose document terminates
// the program via log.Fatal.
func main() {
	// Guard against a missing argument; indexing os.Args[1] blindly panics.
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the URL of the page to analyse as the first argument")
	}

	res, err := http.Get(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()

	// A non-2xx response is usually an error page rather than the requested
	// content. Processing still continues, but the operator is told.
	if res.StatusCode < 200 || res.StatusCode > 299 {
		log.Printf("warning: %s returned status %s", os.Args[1], res.Status)
	}

	z := html.NewTokenizer(res.Body)
	var lines []string
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			// Every ErrorToken ends tokenization, not only io.EOF: after a
			// failure the tokenizer keeps reporting ErrorToken, so looping
			// on would spin forever. Only unexpected errors are logged.
			if err := z.Err(); err != io.EOF {
				log.Println(err)
			}
			break
		}

		switch tt {
		case html.StartTagToken:
			tn, _ := z.TagName()
			switch string(tn) {
			case "script", "style":
				// Discard the token right after the opening tag (the inline
				// code or CSS when present) so it never reaches the text.
				z.Next()
			}
		case html.TextToken:
			if txt := strings.TrimSpace(string(z.Text())); txt != "" {
				lines = append(lines, txt)
			}
		}
	}

	txt := strings.Join(lines, "\n")
	log.Println(txt)
	log.Println("==================================")

	// Create a new document with the default configuration.
	doc, err := prose.NewDocument(txt)
	if err != nil {
		log.Fatal(err)
	}

	// Map each entity's text to its label; a repeated text keeps the label
	// of its last occurrence.
	labels := make(map[string]string)
	for _, ent := range doc.Entities() {
		labels[ent.Text] = ent.Label
	}

	// Map iteration order is random, so sort the keys for stable output and
	// so that similar entity texts end up next to each other.
	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	for i, name := range names {
		if i+1 < len(names) {
			next := names[i+1]
			// Near-duplicate heuristic: drop this entry when its sorted
			// successor starts with it, starts with its first half, or when
			// it starts with the successor's first half. Halves are measured
			// in bytes and are empty for one-byte names, which therefore
			// always match.
			if strings.HasPrefix(next, name) ||
				strings.HasPrefix(next, name[0:len(name)/2]) ||
				strings.HasPrefix(name, next[0:len(next)/2]) {
				continue
			}
		}
		fmt.Println(name, labels[name])
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment