Skip to content

Instantly share code, notes, and snippets.

@jbaiter
Created September 13, 2017 11:49
Show Gist options
  • Save jbaiter/0978b391f4a2254e676ee70fd98e3057 to your computer and use it in GitHub Desktop.
package main
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"flag"
"fmt"
"github.com/beevik/etree"
"github.com/vanng822/go-solr/solr"
"gopkg.in/cheggaaa/pb.v1"
"io"
"io/ioutil"
"math"
"os"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
)
// ParseTask carries one hOCR page, read out of a bundle archive,
// to a parsing worker.
type ParseTask struct {
	// zendId is the work identifier extracted from the file name
	// (e.g. "bsb12345678").
	zendId string
	// pageNo is the page number within the work.
	pageNo int
	// xmlData is the raw hOCR XML of the page.
	xmlData string
}
// OcrBox is one recognized word on a page together with its bounding
// box, expressed as fractions of the page dimensions (see parseBoxes).
type OcrBox struct {
	// zendId is the work identifier the word belongs to.
	zendId string
	// pageNo is the page the word appears on.
	pageNo int
	// wordIdx is the running index of the word on its page.
	wordIdx int
	// text is the recognized word text.
	text string
	// offsetX/offsetY are the upper-left corner as page fractions.
	offsetX float32
	offsetY float32
	// width/height are the box extents as page fractions.
	width  float32
	height float32
}
// PAGE_PAT extracts the work identifier and page number from a bundle
// member name, e.g. "bsb12345678_00042".
var PAGE_PAT = regexp.MustCompile(`(bsb\d{8})_(\d{5})`)

// BBOX_PAT captures the four corner coordinates of an hOCR "bbox" property.
var BBOX_PAT = regexp.MustCompile(`bbox (-?\d+) (-?\d+) (-?\d+) (-?\d+)`)

// LANG_PAT captures the language code of an hOCR "lang" property.
var LANG_PAT = regexp.MustCompile(`lang ([a-z]+)`)

// CONF_PAT captures the word confidence of an hOCR "x_wconf" property.
var CONF_PAT = regexp.MustCompile(`x_wconf ([0-9.]+)`)
// parseBoxes extracts the word-level OCR boxes from one hOCR page.
//
// Coordinates are normalized to fractions of the page size and clamped
// to [0, 0.99] so that the scaled values always fit the fixed-width
// integer encoding used by makeSolrToken (negative hOCR coordinates
// would otherwise break the "%05d" format). Words without a bbox or
// without text are skipped and do not consume a word index.
func parseBoxes(xml string, zendId string, pageNo int) ([]OcrBox, error) {
	doc := etree.NewDocument()
	if err := doc.ReadFromString(xml); err != nil {
		return nil, errors.New("could not parse XML")
	}
	page := doc.FindElement(".//div[@class='ocr_page']")
	if page == nil {
		return nil, errors.New("could not find page element")
	}
	pageCoords := BBOX_PAT.FindStringSubmatch(page.SelectAttrValue("title", ""))
	if len(pageCoords) == 0 {
		return nil, errors.New("page has no bbox")
	}
	pageWidth, _ := strconv.Atoi(pageCoords[3])
	pageHeight, _ := strconv.Atoi(pageCoords[4])
	// Guard against division by zero (and nonsense dimensions), which
	// would yield NaN/Inf coordinates further down.
	if pageWidth <= 0 || pageHeight <= 0 {
		return nil, errors.New("page has invalid dimensions")
	}
	// relCoord converts an absolute pixel value into a page fraction,
	// clamped to [0, 0.99].
	relCoord := func(v, extent int) float32 {
		rel := float64(v) / float64(extent)
		return float32(math.Min(math.Max(rel, 0), 0.99))
	}
	words := doc.FindElements("//span[@class='ocrx_word']")
	boxes := make([]OcrBox, 0, len(words))
	i := 0
	for _, word := range words {
		title := word.SelectAttrValue("title", "")
		coords := BBOX_PAT.FindStringSubmatch(title)
		if len(coords) == 0 {
			fmt.Println("WARNING:", zendId, pageNo, i, "Word has no bbox, skipping")
			fmt.Println("DEBUG: Title was ", title)
			continue
		}
		if word.Text() == "" {
			continue
		}
		ulx, _ := strconv.Atoi(coords[1])
		uly, _ := strconv.Atoi(coords[2])
		lrx, _ := strconv.Atoi(coords[3])
		lry, _ := strconv.Atoi(coords[4])
		boxes = append(boxes, OcrBox{
			zendId: zendId, pageNo: pageNo,
			offsetX: relCoord(ulx, pageWidth),
			offsetY: relCoord(uly, pageHeight),
			width:   relCoord(lrx-ulx, pageWidth),
			height:  relCoord(lry-uly, pageHeight),
			wordIdx: i,
			// '|' is the payload separator in makeSolrToken; strip it
			// from the text so the token stays parseable.
			text: strings.Replace(word.Text(), "|", "", -1),
		})
		i++
	}
	return boxes, nil
}
// readXmls streams the hOCR pages contained in the gzipped tar bundle
// at srcFile onto taskChan, one ParseTask per ".html" member (index
// pages excluded), and closes the channel when the archive is
// exhausted. Unrecoverable archive errors panic, matching the
// fail-fast style of this tool.
func readXmls(srcFile string, taskChan chan ParseTask) {
	f, err := os.Open(srcFile)
	if err != nil {
		fmt.Println(srcFile)
		panic(err)
	}
	defer f.Close()
	gzf, err := gzip.NewReader(f)
	if err != nil {
		fmt.Println(srcFile)
		panic(err)
	}
	// BUG FIX: the gzip reader was previously never closed.
	defer gzf.Close()
	tarReader := tar.NewReader(gzf)
	for {
		header, err := tarReader.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
		name := header.Name
		if !strings.HasSuffix(name, ".html") || strings.HasSuffix(name, "index.html") {
			continue
		}
		// BUG FIX: a non-matching name used to panic on parts[1];
		// skip such members with a warning instead.
		parts := PAGE_PAT.FindStringSubmatch(name)
		if parts == nil {
			fmt.Println("WARNING: could not parse page id from", name, "- skipping")
			continue
		}
		zendId := parts[1]
		pageNo, _ := strconv.Atoi(parts[2])
		xmlBytes, err := ioutil.ReadAll(tarReader)
		if err != nil {
			// BUG FIX: read errors were silently ignored before.
			panic(err)
		}
		xml := string(xmlBytes)
		// Replace soft hyphens (U+00AD) with plain ASCII hyphens.
		xml = strings.Replace(xml, "­", "-", -1)
		taskChan <- ParseTask{xmlData: xml, zendId: zendId, pageNo: pageNo}
	}
	close(taskChan)
}
// makeSolrToken encodes one OCR word as a Solr payload token: the word
// text, a '|' separator, and six zero-padded five-digit fields (page
// number, word index, and the four box coordinates scaled by 100000).
func makeSolrToken(box OcrBox) string {
	// Overlong words are truncated to 224 bytes. BUG FIX: the old code
	// cut at a raw byte offset, which could split a multi-byte UTF-8
	// sequence and emit invalid UTF-8; back up to a rune boundary
	// (continuation bytes match 10xxxxxx).
	if len(box.text) > 224 {
		cut := 224
		for cut > 0 && box.text[cut]&0xC0 == 0x80 {
			cut--
		}
		box.text = box.text[:cut]
	}
	return fmt.Sprintf(
		"%s|%05d%05d%05d%05d%05d%05d",
		box.text, box.pageNo, box.wordIdx,
		int(100000*box.offsetX), int(100000*box.offsetY),
		int(100000*box.width), int(100000*box.height))
}
// xmlReader starts a background reader for the bundle at srcFile and
// returns the channel on which its ParseTasks will arrive. The channel
// is closed by the reader once the bundle is exhausted.
func xmlReader(srcFile string) chan ParseTask {
	tasks := make(chan ParseTask, 128)
	go readXmls(srcFile, tasks)
	return tasks
}
// indexBundle parses every page in the bundle at srcFile and indexes
// one Solr document per work (zendId), with all word payload tokens
// concatenated into the "ocr_text" field. Pages of the same work are
// assumed to arrive consecutively from the bundle.
func indexBundle(srcFile string, solrUrl string, solrCollection string) {
	// BUG FIX: the connection error was previously discarded with "_".
	si, err := solr.NewSolrInterface(solrUrl, solrCollection)
	if err != nil {
		panic(err)
	}
	currentBoxes := make([]OcrBox, 0)
	for task := range xmlReader(srcFile) {
		// A new zendId means the previous work is complete: flush it.
		if len(currentBoxes) > 0 && task.zendId != currentBoxes[0].zendId {
			flushBoxes(si, currentBoxes)
			currentBoxes = currentBoxes[:0]
		}
		boxes, err := parseBoxes(task.xmlData, task.zendId, task.pageNo)
		if err != nil {
			fmt.Println("ERROR parseBoxes(", task.zendId, ".", task.pageNo, "): ", err)
		} else {
			currentBoxes = append(currentBoxes, boxes...)
		}
	}
	// BUG FIX: the last work in the bundle was never flushed, so its
	// document was silently dropped.
	if len(currentBoxes) > 0 {
		flushBoxes(si, currentBoxes)
	}
	si.Commit()
}

// flushBoxes indexes all boxes, which must share one zendId, as a
// single Solr document. On an unsuccessful response the raw result is
// dumped to errors/<zendId>.json for later inspection.
func flushBoxes(si *solr.SolrInterface, boxes []OcrBox) {
	var buffer bytes.Buffer
	for _, box := range boxes {
		buffer.WriteString(makeSolrToken(box))
		buffer.WriteString(" ")
	}
	doc := solr.Document{
		"id":       boxes[0].zendId,
		"ocr_text": buffer.String(),
	}
	res, err := si.Add([]solr.Document{doc}, 1, nil)
	if err != nil {
		panic(err)
	}
	if !res.Success {
		fmt.Println("Could not index ", boxes[0].zendId)
		res.Result["doc"] = doc
		marshalled, _ := json.MarshalIndent(res.Result, "", " ")
		// Best-effort error dump; report (but tolerate) write failures.
		if werr := ioutil.WriteFile(
			"errors/"+boxes[0].zendId+".json", marshalled, 0644); werr != nil {
			fmt.Println("Could not write error report:", werr)
		}
	}
}
// main indexes every bundle named on the command line into Solr,
// fanning the work out over one worker goroutine per CPU and showing
// a progress bar while it runs.
func main() {
	// Generalization: the Solr endpoint used to be hard-coded; it is
	// now configurable, with the old values as defaults.
	solrUrl := flag.String("solr-url", "http://localhost:8983/solr", "base URL of the Solr server")
	solrCollection := flag.String("collection", "fulltext", "Solr collection to index into")
	flag.Parse()
	bundles := flag.Args()
	inChan := make(chan string, len(bundles))
	var wg sync.WaitGroup
	numWorkers := runtime.NumCPU()
	progressChan := make(chan string, numWorkers)
	wg.Add(numWorkers)
	for i := 0; i < numWorkers; i++ {
		go func() {
			defer wg.Done()
			for bundlePath := range inChan {
				indexBundle(bundlePath, *solrUrl, *solrCollection)
				progressChan <- bundlePath
			}
		}()
	}
	for _, bundle := range bundles {
		inChan <- bundle
	}
	close(inChan)
	// Reporter goroutine: consumes exactly one progress event per
	// bundle, so it terminates once all work is done.
	wg.Add(1)
	go func() {
		defer wg.Done()
		bar := pb.StartNew(len(bundles))
		bar.ShowTimeLeft = true
		for i := 0; i < len(bundles); i++ {
			<-progressChan
			bar.Increment()
		}
	}()
	wg.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment