yinleon/requirements.txt

## requirements.txt
numpy
tqdm
pdf2image
opencv-python
pytesseract
Pillow

## tesseract_ocr.py
import os
from PIL import Image
import pytesseract
import cv2
from pdf2image import convert_from_path
import numpy as np
from tqdm import tqdm

# Change these variables to the paths of your input/output files.
# fn_pdf: path of the PDF that is converted to page images below.
# fn_out: path of the text file the recognised text is written to.
fn_pdf = ''
fn_out = ''

def orc_flow(img):
    """
    Run OCR on one page image and return the recognised text.

    The image is turned into a NumPy array, reduced to grayscale,
    binarised with an Otsu threshold, smoothed with a median blur of
    aperture 3, and then handed to Tesseract.
    """
    # The array built from the image is treated as RGB; it is reordered
    # to BGR first and then collapsed to a single gray channel.
    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns a pair; only the thresholded image is needed.
    mode = cv2.THRESH_BINARY | cv2.THRESH_OTSU
    _, binary = cv2.threshold(grayscale, 0, 255, mode)

    smoothed = cv2.medianBlur(binary, 3)
    return pytesseract.image_to_string(smoothed)

# Convert the PDF at fn_pdf into a list of page images.
images = convert_from_path(fn_pdf)

# Run OCR on every page in order; tqdm displays a progress bar.
data = [orc_flow(img) for img in tqdm(images)]

# Write the text of each page followed by a newline.
# The encoding is set explicitly: OCR output frequently contains
# non-ASCII characters, and the platform default encoding (used when
# no encoding is given) may be unable to encode them.
with open(fn_out, 'w', encoding='utf-8') as f:
    f.writelines(txt + '\n' for txt in data)
	import os
	from PIL import Image
	import pytesseract
	import cv2
	from pdf2image import convert_from_path
	import numpy as np
	from tqdm import tqdm

	# Change these variables to the paths of your input/output files.
	# fn_pdf: path of the PDF that is converted to page images below.
	# fn_out: path of the text file the recognised text is written to.
	fn_pdf = ''
	fn_out = ''

	def orc_flow(img):
	    """
	    Take a PIL image, preprocess it, and return the recognised text.

	    Preprocessing converts the image to a NumPy array, reduces it to
	    grayscale, binarises it with an Otsu threshold, and applies a
	    median blur of aperture 3 before calling Tesseract.
	    """
	    # The array built from the image is treated as RGB; reorder it to
	    # BGR and then collapse it to a single gray channel.
	    image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
	    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	    # The flags are combined with a plain bitwise OR; index [1] selects
	    # the thresholded image from the pair cv2.threshold returns.
	    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
	    gray = cv2.medianBlur(gray, 3)
	    text = pytesseract.image_to_string(gray)
	    return text

	# Convert the PDF at fn_pdf into a list of page images.
	images = convert_from_path(fn_pdf)

	# Run OCR on every page in order; tqdm displays a progress bar.
	data = []
	for img in tqdm(images):
	    text = orc_flow(img)
	    data.append(text)

	# Write the text of each page followed by a newline.
	# The encoding is set explicitly: OCR output frequently contains
	# non-ASCII characters, and the platform default encoding (used when
	# no encoding is given) may be unable to encode them.
	with open(fn_out, 'w', encoding='utf-8') as f:
	    for txt in data:
	        f.write(txt + '\n')