This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
import Image, ImageOps | |
import subprocess, sys, os, glob | |
# Minimum run of adjacent pixels required to call something a line.
# NOTE(review): H_THRESH / V_THRESH presumably apply to horizontal and
# vertical runs respectively -- confirm against the line-detection helpers.
H_THRESH = 300
V_THRESH = 300
def get_hlines(pix, w, h): | |
"""Get start/end pixels of lines containing horizontal runs of at least THRESH black pix""" |