Skip to content

Instantly share code, notes, and snippets.

@josephcc
Last active December 10, 2015 13:48
Show Gist options
  • Save josephcc/4443212 to your computer and use it in GitHub Desktop.
Save josephcc/4443212 to your computer and use it in GitHub Desktop.
from nltk.corpus import wordnet as wn
import tesseract
import string
# Set up a Tesseract engine and restrict recognition to the capital letters
# A-Z via the character whitelist.
api = tesseract.TessBaseAPI()
# NOTE(review): "." is presumably the directory that holds the tessdata
# files and "eng" the language to load -- confirm against the binding.
api.Init(".", "eng", tesseract.OEM_DEFAULT)
api.SetVariable("tessedit_char_whitelist", string.ascii_uppercase)

# Read the raw image bytes inside a with-block so the file handle is closed
# as soon as the data is in memory instead of being left open.
with open('source.jpg', 'rb') as image_file:
    mBuffer = image_file.read()

# Run OCR over the in-memory image and show the recognised text.
result = tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)
print(result)
def iter_result(_min, _max, result):
    """Yield every straight run of cells whose length is in [_min, _max].

    ``result`` is a grid given as a list of rows, each row being a sequence
    of strings.  For every length (shortest first) and every start cell
    ``(y, x)`` in row-major order, up to three runs are produced, in this
    order:

    * horizontal, reading left to right;
    * vertical, reading top to bottom;
    * diagonal, reading down and to the right.

    Each item is ``(start, end, text)`` where ``start`` is ``(y, x)``,
    ``end`` is the exclusive end coordinate (``(y, x + length)``,
    ``(y + length, x)`` or ``(y + length, x + length)``) and ``text`` is the
    cells of the run joined together.

    Rows may have different lengths: a run that would cross a missing cell
    is skipped instead of raising ``IndexError``, and columns are scanned up
    to the widest row.  An empty grid yields nothing.
    """
    if not result:
        return
    height = len(result)
    # Use the widest row so a short first row does not hide later columns.
    width = max(len(row) for row in result)
    for length in range(_min, _max + 1):
        for y in range(height):
            # Rows a vertical/diagonal run starting on row y would cover.
            window = result[y:y + length]
            for x in range(width):
                across = result[y][x:x + length]
                if len(across) == length:
                    yield ((y, x), (y, x + length), ''.join(across))
                if len(window) != length:
                    # Not enough rows below for a vertical or diagonal run.
                    continue
                # Vertical: every covered row must actually have column x.
                if all(len(row) > x for row in window):
                    down = ''.join(row[x] for row in window)
                    yield ((y, x), (y + length, x), down)
                # Diagonal: the i-th covered row must have column x + i.
                if all(len(row) > x + i for i, row in enumerate(window)):
                    diag = ''.join(row[x + i] for i, row in enumerate(window))
                    yield ((y, x), (y + length, x + length), diag)
# Turn the OCR text into a grid: one row per line, one cell per
# whitespace-separated token.  Blank lines are skipped because they would
# become empty rows, and iter_result indexes every row by column.
result = [
    cells
    for cells in (line.split() for line in result.strip().splitlines())
    if cells
]

# Try every run of 3 to 17 cells and keep the ones WordNet recognises.
for s, e, w in iter_result(3, 17, result):
    lemmas = wn.lemmas(w.lower())
    # An unknown word has no lemmas, so the sum is 0 and it is rejected;
    # known words are kept only when their lemma counts add up to >= 1.
    if sum(lemma.count() for lemma in lemmas) >= 1:
        # Same "word start end" output as a Python 2 print statement, but
        # this form also parses under Python 3.
        print('%s %s %s' % (w, s, e))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment