Skip to content

Instantly share code, notes, and snippets.

@windhooked
Created August 8, 2019 15:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save windhooked/3fd042d1d87b0f9b560ce5f47a226e9a to your computer and use it in GitHub Desktop.
Save windhooked/3fd042d1d87b0f9b560ce5f47a226e9a to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
sp "github.com/scipipe/scipipe"
comp "github.com/scipipe/scipipe/components"
"io/ioutil"
// "path/filepath"
"os"
"strings"
)
// main builds and runs the "pdf2ocr" scipipe workflow. It globs PDF files,
// runs pdfimages on each one to extract TIFF images, and then executes a shell
// command once per extracted TIFF file.
func main() {
	const pdfGlob = "data/*/attachments/*.pdf"

	wf := sp.NewWorkflow("pdf2ocr", 20)

	// Source process: feeds the files matching the glob pattern downstream.
	globpdf := comp.NewFileGlobber(wf, "findpdf", pdfGlob)

	// Run pdfimages with the PDF path as the image root name, then write the
	// names of the resulting "<pdf>*.tif" files, one per line, to the out port.
	ximages := wf.NewProc("pdfimages", "pdfimages -tiff -p \"{i:in}\" \"{i:in}\" ; ls \"{i:in}\"*.tif > {o:out} ")
	ximages.In("in").From(globpdf.Out())

	// The command string is a shell comment; it only serves to declare the
	// ports. The real work is done in CustomExecute below.
	ocr := wf.NewProc("tesseract", "# Just create the ports: {i:tiffiles} {o:out}")
	ocr.CustomExecute = func(t *sp.Task) {
		listing, err := ioutil.ReadFile(t.InPath("tiffiles"))
		sp.Check(err)

		// Debug aid: the working directory is the same for every file, so
		// look it up (and report it) once instead of on each iteration.
		cwd, err := os.Getwd()
		sp.Check(err)
		fmt.Println(cwd)

		for _, tifFile := range strings.Split(string(listing), "\n") {
			// The listing ends with a newline, so splitting yields a trailing
			// empty entry; running the command on it would act on the current
			// directory instead of a TIFF file.
			if tifFile == "" {
				continue
			}
			// Placeholder for the intended OCR step, e.g.:
			//   /usr/bin/tesseract "<tif>" "<tif>.ocr" --psm 1 --oem 1 -l eng
			// The name is quoted so paths containing spaces or shell
			// metacharacters are passed through as a single argument.
			sp.ExecCmd(fmt.Sprintf("ls %s ", shellQuote(tifFile)))
		}
		// NOTE(review): nothing here writes the file for the "out" port, and
		// no path is configured for it (a SetOut call was previously commented
		// out) — confirm whether scipipe requires that output to exist.
	}
	ocr.In("tiffiles").From(ximages.Out("out"))

	wf.Run()
}

// shellQuote wraps s in single quotes so that a POSIX shell treats it as one
// literal word; embedded single quotes are closed, escaped, and reopened.
func shellQuote(s string) string {
	return "'" + strings.Replace(s, "'", `'\''`, -1) + "'"
}
~
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment