Sigmame/pdf_table_with Tesseract

## pdf_table_with Tesseract
# Reference: http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
import Image, ImageOps
import subprocess, sys, os, glob

# Minimum run of adjacent black pixels to call something a line.
# get_hlines keeps a row only when its longest run is strictly greater than
# H_THRESH; get_vlines does the same for columns with V_THRESH.
H_THRESH = 300
V_THRESH = 300

def get_hlines(pix, w, h, thresh=None):
    """Find image rows that contain a long horizontal run of black pixels.

    Args:
        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as black
            only when it equals ``(0, 0, 0)``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a row qualifies when its longest run of consecutive black
            pixels is strictly greater than this value. Defaults to the
            module-level ``H_THRESH``.

    Returns:
        A list of ``(x1, y, x2, y)`` tuples, one per qualifying row in
        top-to-bottom order, where ``x1`` and ``x2`` are the first and last
        black pixels found anywhere in that row.
    """
    if thresh is None:
        thresh = H_THRESH
    hlines = []
    for y in range(h):
        x1 = x2 = None
        black = 0
        run = 0
        for x in range(w):
            if pix[x, y] == (0, 0, 0):
                black += 1
                # Compare against None: column 0 is a valid start but falsy.
                if x1 is None:
                    x1 = x
                x2 = x
            else:
                run = max(run, black)
                black = 0
        # A run touching the right edge is never closed by a non-black
        # pixel, so fold the still-open run in before testing.
        run = max(run, black)
        if run > thresh:
            hlines.append((x1, y, x2, y))
    return hlines

def get_vlines(pix, w, h, thresh=None):
    """Find image columns that contain a long vertical run of black pixels.

    Args:
        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as black
            only when it equals ``(0, 0, 0)``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a column qualifies when its longest run of consecutive black
            pixels is strictly greater than this value. Defaults to the
            module-level ``V_THRESH``.

    Returns:
        A list of ``(x, y1, x, y2)`` tuples, one per qualifying column in
        left-to-right order, where ``y1`` and ``y2`` are the first and last
        black pixels found anywhere in that column.
    """
    if thresh is None:
        thresh = V_THRESH
    vlines = []
    for x in range(w):
        y1 = y2 = None
        black = 0
        run = 0
        for y in range(h):
            if pix[x, y] == (0, 0, 0):
                black += 1
                # Compare against None: row 0 is a valid start but falsy.
                if y1 is None:
                    y1 = y
                y2 = y
            else:
                run = max(run, black)
                black = 0
        # A run touching the bottom edge is never closed by a non-black
        # pixel, so fold the still-open run in before testing.
        run = max(run, black)
        if run > thresh:
            vlines.append((x, y1, x, y2))
    return vlines

def get_cols(vlines):
    """Build column boxes from neighbouring vertical lines.

    Each pair of consecutive entries in ``vlines`` whose x positions differ
    by more than 1 yields one ``(x1, y1, x2, y2)`` box: the top-left corner
    comes from the start of the left line, the bottom-right corner from the
    end of the right line. Pairs only 1 pixel apart are skipped.
    """
    return [
        (left[0], left[1], right[2], right[3])
        for left, right in zip(vlines, vlines[1:])
        if right[0] - left[0] > 1
    ]

def get_rows(hlines):
    """Build row boxes from neighbouring horizontal lines.

    Each pair of consecutive entries in ``hlines`` whose y positions differ
    by more than 1 yields one ``(x1, y1, x2, y2)`` box: the top-left corner
    comes from the start of the upper line, the bottom-right corner from the
    end of the lower line. Pairs only 1 pixel apart are skipped.
    """
    return [
        (upper[0], upper[1], lower[2], lower[3])
        for upper, lower in zip(hlines, hlines[1:])
        if lower[1] - upper[3] > 1
    ]

def get_cells(rows, cols):
    """Intersect row and column boxes into per-cell boxes.

    Returns a nested dict ``cells[i][j] -> (x1, y1, x2, y2)`` where ``i`` is
    the index into ``rows`` and ``j`` the index into ``cols``. The x
    coordinates are taken from the column box and the y coordinates from the
    row box.
    """
    return {
        i: {j: (col[0], row[1], col[2], row[3]) for j, col in enumerate(cols)}
        for i, row in enumerate(rows)
    }

def ocr_cell(im, cells, x, y):
    """Return the first line of text Tesseract recognises in one table cell.

    The cell is cropped out of ``im``, reduced to pure black/white, trimmed
    along its diagonal to drop border pixels, saved as
    ``working/<x>-<y>.tif`` and handed to the ``tesseract`` command. The
    text is then read back from ``working/<x>-<y>.txt``.

    Args:
        im: source image; ``im.crop`` is called with the cell's box.
        cells: nested mapping ``cells[x][y] -> (x1, y1, x2, y2)``.
        x: first index into ``cells`` (get_image_data passes the row index).
        y: second index into ``cells`` (get_image_data passes the column
            index).

    Returns:
        The first line of the OCR output with surrounding whitespace
        stripped, or ``""`` when the output file is empty.
    """
    fbase = "working/%d-%d" % (x, y)
    ftif = "%s.tif" % fbase
    ftxt = "%s.txt" % fbase
    # extract cell from whole image, grayscale (1-color channel), monochrome:
    # values above 200 become 255, everything else becomes 0
    region = im.crop(cells[x][y])
    region = ImageOps.grayscale(region)
    region = region.point(lambda p: p > 200 and 255)
    # determine background color (most used color)
    histo = region.histogram()
    bgcolor = 0 if histo[0] > histo[255] else 255
    # trim borders by walking the diagonal inwards from the top-left and
    # bottom-right corners until a background pixel is found
    pix = region.load()
    width, height = region.size
    x1, y1 = 0, 0
    x2, y2 = width - 1, height - 1
    # The bounds checks keep the walk inside the image when the diagonal
    # never reaches a background pixel (previously an out-of-range access).
    while x1 < x2 and y1 < y2 and pix[x1, y1] != bgcolor:
        x1 += 1
        y1 += 1
    while x2 > x1 and y2 > y1 and pix[x2, y2] != bgcolor:
        x2 -= 1
        y2 -= 1
    # save as TIFF and extract text with Tesseract OCR
    trimmed = region.crop((x1, y1, x2, y2))
    trimmed.save(ftif, "TIFF")
    # Argument list (no shell) and stderr sent to the null device: a PIPE
    # that nobody reads can block the child once the pipe buffer fills.
    with open(os.devnull, "w") as devnull:
        subprocess.call(["tesseract", ftif, fbase], stderr=devnull)
    with open(ftxt) as result:
        lines = [line.strip() for line in result]
    # An empty output file means nothing was recognised in this cell.
    return lines[0] if lines else ""

def get_image_data(filename):
    """Return the OCRed text of a table image as ``data[row][col]``.

    Detects the horizontal and vertical rules in the image, derives the
    row, column and cell boxes from them, and runs ocr_cell on every cell.
    The number of lines, rows and columns found is reported on stderr.
    """
    image = Image.open(filename)
    pixels = image.load()
    width, height = image.size

    # Line detection first, then the boxes derived from those lines; each
    # count is logged as soon as it is known.
    hlines = get_hlines(pixels, width, height)
    sys.stderr.write("%s: hlines: %d\n" % (filename, len(hlines)))
    vlines = get_vlines(pixels, width, height)
    sys.stderr.write("%s: vlines: %d\n" % (filename, len(vlines)))
    rows = get_rows(hlines)
    sys.stderr.write("%s: rows: %d\n" % (filename, len(rows)))
    cols = get_cols(vlines)
    sys.stderr.write("%s: cols: %d\n" % (filename, len(cols)))
    cells = get_cells(rows, cols)

    n_rows, n_cols = len(rows), len(cols)
    return [
        [ocr_cell(image, cells, r, c) for c in range(n_cols)]
        for r in range(n_rows)
    ]

def split_pdf(filename):
    """Split a PDF into one PNG per page and return the page filenames.

    Runs the external ``convert`` command with ``-density 600``, writing
    ``working/<name>-<page>.png`` where ``<name>`` is the PDF's base name
    without its extension.

    Args:
        filename: path of the PDF to split.

    Returns:
        The generated PNG paths, ordered by page number.

    Raises:
        OSError: if the ``convert`` executable cannot be started.
    """
    def page_number(path):
        """Sort key: the integer between the last '-' and the extension."""
        stem = os.path.splitext(os.path.basename(path))[0]
        digits = stem.rsplit("-", 1)[-1]
        return int(digits) if digits.isdigit() else -1

    # splitext/basename instead of filename[:-4]: works for any extension
    # length and keeps directory components out of the working/ file names.
    prefix = os.path.splitext(os.path.basename(filename))[0]
    target = os.path.join("working", prefix + "-%d.png")
    # Argument list instead of a shell string, so spaces or shell
    # metacharacters in the filename are passed through literally.
    subprocess.call(["convert", "-density", "600", filename, target])
    pages = glob.glob(os.path.join("working", prefix + "-*.png"))
    # glob returns files in arbitrary order; sort numerically so page 10
    # comes after page 2.
    return sorted(pages, key=page_number)

def extract_pdf(filename):
    """Extract table data from every page of a PDF.

    Splits the PDF with split_pdf, runs get_image_data on each page image
    and concatenates the per-page rows. The ``.tif``/``.txt`` files in
    ``working/`` are removed after each page, and every file matched by
    ``working/*`` is removed at the end, even if extraction fails part-way.

    Args:
        filename: path of the PDF to process.

    Returns:
        A list of rows in page order; each row is a list produced by
        get_image_data.
    """
    def remove_files(pattern):
        """Best-effort delete of regular files in working/ matching pattern."""
        for path in glob.glob(os.path.join("working", pattern)):
            if not os.path.isfile(path):
                continue
            try:
                os.remove(path)
            except OSError as err:
                # Report and carry on, as the previous `rm` call did.
                sys.stderr.write("could not remove %s: %s\n" % (path, err))

    data = []
    try:
        pngfiles = split_pdf(filename)
        sys.stderr.write("Pages: %d\n" % len(pngfiles))
        # extract table data from each page
        for pngfile in pngfiles:
            data.extend(get_image_data(pngfile))
            # remove temp files for this page
            remove_files("*.tif")
            remove_files("*.txt")
    finally:
        # remove split pages (and anything left over after a failure, so a
        # later run does not pick up stale page images)
        remove_files("*")
    return data

if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the PDF path,
    # and prints the extracted table as tab-separated rows on stdout.
    if len(sys.argv) != 2:
        # Usage errors go to stderr with a non-zero status so that calling
        # scripts can detect them; sys.exit replaces the site-only exit().
        sys.stderr.write("Usage: ctocr.py FILENAME\n")
        sys.exit(2)
    # split target pdf into pages
    filename = sys.argv[1]
    data = extract_pdf(filename)
    for row in data:
        sys.stdout.write("\t".join(row) + "\n")
	# Reference: http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
	import Image, ImageOps
	import subprocess, sys, os, glob

	# Minimum run of adjacent black pixels to call something a line.
	# get_hlines keeps a row only when its longest run is strictly greater than
	# H_THRESH; get_vlines does the same for columns with V_THRESH.
	H_THRESH = 300
	V_THRESH = 300

	def get_hlines(pix, w, h, thresh=None):
	    """Find image rows that contain a long horizontal run of black pixels.

	    Args:
	        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as
	            black only when it equals ``(0, 0, 0)``.
	        w: number of columns to scan.
	        h: number of rows to scan.
	        thresh: a row qualifies when its longest run of consecutive black
	            pixels is strictly greater than this value. Defaults to the
	            module-level ``H_THRESH``.

	    Returns:
	        A list of ``(x1, y, x2, y)`` tuples, one per qualifying row in
	        top-to-bottom order, where ``x1`` and ``x2`` are the first and
	        last black pixels found anywhere in that row.
	    """
	    if thresh is None:
	        thresh = H_THRESH
	    hlines = []
	    for y in range(h):
	        x1 = x2 = None
	        black = 0
	        run = 0
	        for x in range(w):
	            if pix[x, y] == (0, 0, 0):
	                black += 1
	                # Compare against None: column 0 is a valid start but falsy.
	                if x1 is None:
	                    x1 = x
	                x2 = x
	            else:
	                run = max(run, black)
	                black = 0
	        # A run touching the right edge is never closed by a non-black
	        # pixel, so fold the still-open run in before testing.
	        run = max(run, black)
	        if run > thresh:
	            hlines.append((x1, y, x2, y))
	    return hlines

	def get_vlines(pix, w, h, thresh=None):
	    """Find image columns that contain a long vertical run of black pixels.

	    Args:
	        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as
	            black only when it equals ``(0, 0, 0)``.
	        w: number of columns to scan.
	        h: number of rows to scan.
	        thresh: a column qualifies when its longest run of consecutive
	            black pixels is strictly greater than this value. Defaults to
	            the module-level ``V_THRESH``.

	    Returns:
	        A list of ``(x, y1, x, y2)`` tuples, one per qualifying column in
	        left-to-right order, where ``y1`` and ``y2`` are the first and
	        last black pixels found anywhere in that column.
	    """
	    if thresh is None:
	        thresh = V_THRESH
	    vlines = []
	    for x in range(w):
	        y1 = y2 = None
	        black = 0
	        run = 0
	        for y in range(h):
	            if pix[x, y] == (0, 0, 0):
	                black += 1
	                # Compare against None: row 0 is a valid start but falsy.
	                if y1 is None:
	                    y1 = y
	                y2 = y
	            else:
	                run = max(run, black)
	                black = 0
	        # A run touching the bottom edge is never closed by a non-black
	        # pixel, so fold the still-open run in before testing.
	        run = max(run, black)
	        if run > thresh:
	            vlines.append((x, y1, x, y2))
	    return vlines

	def get_cols(vlines):
	    """Build column boxes from neighbouring vertical lines.

	    Each pair of consecutive entries in ``vlines`` whose x positions
	    differ by more than 1 yields one ``(x1, y1, x2, y2)`` box: the
	    top-left corner comes from the start of the left line, the
	    bottom-right corner from the end of the right line. Pairs only
	    1 pixel apart are skipped.
	    """
	    cols = []
	    for i in range(1, len(vlines)):
	        left, right = vlines[i - 1], vlines[i]
	        if right[0] - left[0] > 1:
	            cols.append((left[0], left[1], right[2], right[3]))
	    return cols

	def get_rows(hlines):
	    """Build row boxes from neighbouring horizontal lines.

	    Each pair of consecutive entries in ``hlines`` whose y positions
	    differ by more than 1 yields one ``(x1, y1, x2, y2)`` box: the
	    top-left corner comes from the start of the upper line, the
	    bottom-right corner from the end of the lower line. Pairs only
	    1 pixel apart are skipped.
	    """
	    rows = []
	    for i in range(1, len(hlines)):
	        upper, lower = hlines[i - 1], hlines[i]
	        if lower[1] - upper[3] > 1:
	            rows.append((upper[0], upper[1], lower[2], lower[3]))
	    return rows

	def get_cells(rows, cols):
	    """Intersect row and column boxes into per-cell boxes.

	    Returns a nested dict ``cells[i][j] -> (x1, y1, x2, y2)`` where ``i``
	    is the index into ``rows`` and ``j`` the index into ``cols``. The x
	    coordinates are taken from the column box and the y coordinates from
	    the row box.
	    """
	    cells = {}
	    for i, row in enumerate(rows):
	        cells.setdefault(i, {})
	        for j, col in enumerate(cols):
	            cells[i][j] = (col[0], row[1], col[2], row[3])
	    return cells

	def ocr_cell(im, cells, x, y):
	    """Return the first line of text Tesseract recognises in one table cell.

	    The cell is cropped out of ``im``, reduced to pure black/white,
	    trimmed along its diagonal to drop border pixels, saved as
	    ``working/<x>-<y>.tif`` and handed to the ``tesseract`` command. The
	    text is then read back from ``working/<x>-<y>.txt``.

	    Args:
	        im: source image; ``im.crop`` is called with the cell's box.
	        cells: nested mapping ``cells[x][y] -> (x1, y1, x2, y2)``.
	        x: first index into ``cells``.
	        y: second index into ``cells``.

	    Returns:
	        The first line of the OCR output with surrounding whitespace
	        stripped, or ``""`` when the output file is empty.
	    """
	    fbase = "working/%d-%d" % (x, y)
	    ftif = "%s.tif" % fbase
	    ftxt = "%s.txt" % fbase
	    # extract cell from whole image, grayscale (1-color channel),
	    # monochrome: values above 200 become 255, everything else becomes 0
	    region = im.crop(cells[x][y])
	    region = ImageOps.grayscale(region)
	    region = region.point(lambda p: p > 200 and 255)
	    # determine background color (most used color)
	    histo = region.histogram()
	    bgcolor = 0 if histo[0] > histo[255] else 255
	    # trim borders by walking the diagonal inwards from the top-left and
	    # bottom-right corners until a background pixel is found
	    pix = region.load()
	    width, height = region.size
	    x1, y1 = 0, 0
	    x2, y2 = width - 1, height - 1
	    # The bounds checks keep the walk inside the image when the diagonal
	    # never reaches a background pixel (previously an out-of-range access).
	    while x1 < x2 and y1 < y2 and pix[x1, y1] != bgcolor:
	        x1 += 1
	        y1 += 1
	    while x2 > x1 and y2 > y1 and pix[x2, y2] != bgcolor:
	        x2 -= 1
	        y2 -= 1
	    # save as TIFF and extract text with Tesseract OCR
	    trimmed = region.crop((x1, y1, x2, y2))
	    trimmed.save(ftif, "TIFF")
	    # Argument list (no shell) and stderr sent to the null device: a PIPE
	    # that nobody reads can block the child once the pipe buffer fills.
	    with open(os.devnull, "w") as devnull:
	        subprocess.call(["tesseract", ftif, fbase], stderr=devnull)
	    with open(ftxt) as result:
	        lines = [line.strip() for line in result]
	    # An empty output file means nothing was recognised in this cell.
	    return lines[0] if lines else ""

	def get_image_data(filename):
	    """Return the OCRed text of a table image as ``data[row][col]``.

	    Detects the horizontal and vertical rules in the image, derives the
	    row, column and cell boxes from them, and runs ocr_cell on every
	    cell. The number of lines, rows and columns found is reported on
	    stderr.
	    """
	    im = Image.open(filename)
	    pix = im.load()
	    width, height = im.size
	    hlines = get_hlines(pix, width, height)
	    sys.stderr.write("%s: hlines: %d\n" % (filename, len(hlines)))
	    vlines = get_vlines(pix, width, height)
	    sys.stderr.write("%s: vlines: %d\n" % (filename, len(vlines)))
	    rows = get_rows(hlines)
	    sys.stderr.write("%s: rows: %d\n" % (filename, len(rows)))
	    cols = get_cols(vlines)
	    sys.stderr.write("%s: cols: %d\n" % (filename, len(cols)))
	    cells = get_cells(rows, cols)

	    data = []
	    for row in range(len(rows)):
	        data.append([ocr_cell(im, cells, row, col) for col in range(len(cols))])
	    return data

	def split_pdf(filename):
	    """Split a PDF into one PNG per page and return the page filenames.

	    Runs the external ``convert`` command with ``-density 600``, writing
	    ``working/<name>-<page>.png`` where ``<name>`` is the PDF's base name
	    without its extension.

	    Args:
	        filename: path of the PDF to split.

	    Returns:
	        The generated PNG paths, ordered by page number.

	    Raises:
	        OSError: if the ``convert`` executable cannot be started.
	    """
	    def page_number(path):
	        """Sort key: the integer between the last '-' and the extension."""
	        stem = os.path.splitext(os.path.basename(path))[0]
	        digits = stem.rsplit("-", 1)[-1]
	        return int(digits) if digits.isdigit() else -1

	    # splitext/basename instead of filename[:-4]: works for any extension
	    # length and keeps directory components out of the working/ file names.
	    prefix = os.path.splitext(os.path.basename(filename))[0]
	    target = os.path.join("working", prefix + "-%d.png")
	    # Argument list instead of a shell string, so spaces or shell
	    # metacharacters in the filename are passed through literally.
	    subprocess.call(["convert", "-density", "600", filename, target])
	    pages = glob.glob(os.path.join("working", prefix + "-*.png"))
	    # glob returns files in arbitrary order; sort numerically so page 10
	    # comes after page 2.
	    return sorted(pages, key=page_number)

	def extract_pdf(filename):
	    """Extract table data from every page of a PDF.

	    Splits the PDF with split_pdf, runs get_image_data on each page image
	    and concatenates the per-page rows. The ``.tif``/``.txt`` files in
	    ``working/`` are removed after each page, and every file matched by
	    ``working/*`` is removed at the end, even if extraction fails
	    part-way.

	    Args:
	        filename: path of the PDF to process.

	    Returns:
	        A list of rows in page order; each row is a list produced by
	        get_image_data.
	    """
	    def remove_files(pattern):
	        """Best-effort delete of regular files in working/ matching pattern."""
	        for path in glob.glob(os.path.join("working", pattern)):
	            if not os.path.isfile(path):
	                continue
	            try:
	                os.remove(path)
	            except OSError as err:
	                # Report and carry on, as the previous `rm` call did.
	                sys.stderr.write("could not remove %s: %s\n" % (path, err))

	    data = []
	    try:
	        pngfiles = split_pdf(filename)
	        sys.stderr.write("Pages: %d\n" % len(pngfiles))
	        # extract table data from each page
	        for pngfile in pngfiles:
	            data.extend(get_image_data(pngfile))
	            # remove temp files for this page
	            remove_files("*.tif")
	            remove_files("*.txt")
	    finally:
	        # remove split pages (and anything left over after a failure, so
	        # a later run does not pick up stale page images)
	        remove_files("*")
	    return data

	if __name__ == '__main__':
	    # Command-line entry point: expects exactly one argument, the PDF
	    # path, and prints the extracted table as tab-separated rows on stdout.
	    if len(sys.argv) != 2:
	        # Usage errors go to stderr with a non-zero status so that calling
	        # scripts can detect them; sys.exit replaces the site-only exit().
	        sys.stderr.write("Usage: ctocr.py FILENAME\n")
	        sys.exit(2)
	    # split target pdf into pages
	    filename = sys.argv[1]
	    data = extract_pdf(filename)
	    for row in data:
	        sys.stdout.write("\t".join(row) + "\n")