onyekaa/ocr.py

## ocr.py
# import the necessary packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import csv

# Minimum number of whitespace-separated tokens an OCR line must contain
# before it is treated as a table row rather than noise.
MIN_COLUMNS = 8

# Size (width, height) every page is scaled to before cropping, so the fixed
# crop window below covers the same region regardless of the scan size.
PAGE_SIZE = (2200, 1700)

# Header row of the generated CSV file.
CSV_HEADER = ['no', 'title', 'studio', 'dis', 'weeks', 'locs', 'weekend revenue', 'rev_change', 'avg_loc', 'admissions', 'adm_change', 'flash_rev', 'flash_adm']


def parse_ocr_rows(text, min_columns=MIN_COLUMNS):
    """Turn raw OCR text into a list of CSV rows.

    Each '\\n'-separated line is split on whitespace.  Lines with fewer than
    ``min_columns`` tokens are treated as broken and skipped.  For the
    remaining lines the first token (the printed row number) is replaced by a
    running count of accepted rows, the tokens up to the first all-uppercase
    token are joined into the title, and that uppercase token (taken to be the
    studio, e.g. INDP, FILMONE, SILVERBIRD, CRIMSON) plus everything after it
    are kept as individual columns.

    Args:
        text: the string returned by the OCR engine.
        min_columns: minimum token count for a line to be accepted.

    Returns:
        A list of rows; each row is ``[count, title, studio, ...]``.  If a
        line contains no uppercase token the row is ``[count, title]``.
    """
    rows = []
    # Splitting on '\n' (instead of accumulating characters) guarantees that a
    # skipped line can never leak into the next row and that a final line
    # without a trailing newline is still processed.
    for line in text.split('\n'):
        # split() with no argument collapses runs of whitespace, so doubled
        # spaces in the OCR output do not produce empty columns.
        tokens = line.split()
        if len(tokens) < min_columns:
            continue

        # tokens[0] is the row number as read by OCR; it is unreliable, so the
        # running count is written instead.
        rest = tokens[1:]
        # The title ends where the first all-uppercase token (the studio)
        # begins.  Tokens without cased characters (numbers) are not upper.
        studio_at = next(
            (i for i, tok in enumerate(rest) if tok.isupper()), len(rest))
        title = ' '.join(rest[:studio_at])
        rows.append([len(rows) + 1, title] + rest[studio_at:])
    return rows


def main():
    """Crop and OCR the image given on the command line and write a CSV.

    Side effects: writes ``<image>-cropped.png`` and ``<image>.csv`` next to
    the input image and prints the raw OCR text to stdout.
    """
    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True,
                    help="path to input image to be OCR'd")
    ap.add_argument("-p", "--preprocess", type=str, default="thresh",
                    help="type of preprocessing to be done")
    args = vars(ap.parse_args())

    # load the input image; cv2.imread returns None when it cannot read it
    page = cv2.imread(args["image"])
    if page is None:
        ap.error("could not read image: {}".format(args["image"]))
    fname = os.path.splitext(args['image'])[0]
    crname = '{}-cropped.png'.format(fname)

    # resize first so the crop window is consistent, then crop away all the
    # unnecessary bits (the crop is taken from the *resized* page)
    page = cv2.resize(page, PAGE_SIZE)
    cropimg = page[641:1349, 40:2224]
    cv2.imwrite(crname, cropimg)

    gray = cv2.cvtColor(cropimg, cv2.COLOR_BGR2GRAY)

    # check to see if we should apply thresholding to preprocess the image
    if args["preprocess"] == "thresh":
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # or whether median blurring should be done to remove noise
    elif args["preprocess"] == "blur":
        gray = cv2.medianBlur(gray, 3)

    # write the grayscale image to disk as a temporary file, apply OCR to it,
    # and always delete the temporary file afterwards
    filename = "{}.png".format(os.getpid())
    try:
        cv2.imwrite(filename, gray)
        with Image.open(filename) as ocr_input:
            text = pytesseract.image_to_string(ocr_input, config='--psm 6')
    finally:
        if os.path.exists(filename):
            os.remove(filename)
    print(text)

    csvname = '{}.csv'.format(fname)
    # newline='' is what the csv module expects of its file object; utf-8
    # keeps non-ASCII OCR output from failing on platforms with a narrower
    # default encoding.  No escapechar is needed: fields never contain the
    # delimiter-breaking newline because every token is whitespace-split.
    with open(csvname, mode='w', newline='', encoding='utf-8') as b_file:
        bw = csv.writer(b_file, delimiter=',', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
        bw.writerow(CSV_HEADER)
        bw.writerows(parse_ocr_rows(text))


if __name__ == "__main__":
    main()


# show the output images
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
# cv2.waitKey(1)
	# import the necessary packages
	from PIL import Image
	import pytesseract
	import argparse
	import cv2
	import os
	import csv

	# Minimum number of whitespace-separated tokens an OCR line must contain
	# before it is treated as a table row rather than noise.
	MIN_COLUMNS = 8

	# Size (width, height) every page is scaled to before cropping, so the fixed
	# crop window below covers the same region regardless of the scan size.
	PAGE_SIZE = (2200, 1700)

	# Header row of the generated CSV file.
	CSV_HEADER = ['no', 'title', 'studio', 'dis', 'weeks', 'locs', 'weekend revenue', 'rev_change', 'avg_loc', 'admissions', 'adm_change', 'flash_rev', 'flash_adm']


	def parse_ocr_rows(text, min_columns=MIN_COLUMNS):
	    """Turn raw OCR text into a list of CSV rows.

	    Each '\\n'-separated line is split on whitespace.  Lines with fewer than
	    ``min_columns`` tokens are treated as broken and skipped.  For the
	    remaining lines the first token (the printed row number) is replaced by a
	    running count of accepted rows, the tokens up to the first all-uppercase
	    token are joined into the title, and that uppercase token (taken to be the
	    studio, e.g. INDP, FILMONE, SILVERBIRD, CRIMSON) plus everything after it
	    are kept as individual columns.

	    Args:
	        text: the string returned by the OCR engine.
	        min_columns: minimum token count for a line to be accepted.

	    Returns:
	        A list of rows; each row is ``[count, title, studio, ...]``.  If a
	        line contains no uppercase token the row is ``[count, title]``.
	    """
	    rows = []
	    # Splitting on '\n' (instead of accumulating characters) guarantees that a
	    # skipped line can never leak into the next row and that a final line
	    # without a trailing newline is still processed.
	    for line in text.split('\n'):
	        # split() with no argument collapses runs of whitespace, so doubled
	        # spaces in the OCR output do not produce empty columns.
	        tokens = line.split()
	        if len(tokens) < min_columns:
	            continue

	        # tokens[0] is the row number as read by OCR; it is unreliable, so the
	        # running count is written instead.
	        rest = tokens[1:]
	        # The title ends where the first all-uppercase token (the studio)
	        # begins.  Tokens without cased characters (numbers) are not upper.
	        studio_at = next(
	            (i for i, tok in enumerate(rest) if tok.isupper()), len(rest))
	        title = ' '.join(rest[:studio_at])
	        rows.append([len(rows) + 1, title] + rest[studio_at:])
	    return rows


	def main():
	    """Crop and OCR the image given on the command line and write a CSV.

	    Side effects: writes ``<image>-cropped.png`` and ``<image>.csv`` next to
	    the input image and prints the raw OCR text to stdout.
	    """
	    # construct the argument parser and parse the arguments
	    ap = argparse.ArgumentParser()
	    ap.add_argument("-i", "--image", required=True,
	                    help="path to input image to be OCR'd")
	    ap.add_argument("-p", "--preprocess", type=str, default="thresh",
	                    help="type of preprocessing to be done")
	    args = vars(ap.parse_args())

	    # load the input image; cv2.imread returns None when it cannot read it
	    page = cv2.imread(args["image"])
	    if page is None:
	        ap.error("could not read image: {}".format(args["image"]))
	    fname = os.path.splitext(args['image'])[0]
	    crname = '{}-cropped.png'.format(fname)

	    # resize first so the crop window is consistent, then crop away all the
	    # unnecessary bits (the crop is taken from the *resized* page)
	    page = cv2.resize(page, PAGE_SIZE)
	    cropimg = page[641:1349, 40:2224]
	    cv2.imwrite(crname, cropimg)

	    gray = cv2.cvtColor(cropimg, cv2.COLOR_BGR2GRAY)

	    # check to see if we should apply thresholding to preprocess the image
	    if args["preprocess"] == "thresh":
	        gray = cv2.threshold(gray, 0, 255,
	                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
	    # or whether median blurring should be done to remove noise
	    elif args["preprocess"] == "blur":
	        gray = cv2.medianBlur(gray, 3)

	    # write the grayscale image to disk as a temporary file, apply OCR to it,
	    # and always delete the temporary file afterwards
	    filename = "{}.png".format(os.getpid())
	    try:
	        cv2.imwrite(filename, gray)
	        with Image.open(filename) as ocr_input:
	            text = pytesseract.image_to_string(ocr_input, config='--psm 6')
	    finally:
	        if os.path.exists(filename):
	            os.remove(filename)
	    print(text)

	    csvname = '{}.csv'.format(fname)
	    # newline='' is what the csv module expects of its file object; utf-8
	    # keeps non-ASCII OCR output from failing on platforms with a narrower
	    # default encoding.  No escapechar is needed: fields never contain the
	    # delimiter-breaking newline because every token is whitespace-split.
	    with open(csvname, mode='w', newline='', encoding='utf-8') as b_file:
	        bw = csv.writer(b_file, delimiter=',', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
	        bw.writerow(CSV_HEADER)
	        bw.writerows(parse_ocr_rows(text))


	if __name__ == "__main__":
	    main()



	# show the output images
	# cv2.imshow("Image", image)
	# cv2.imshow("Output", gray)
	# cv2.waitKey(0)
	# cv2.destroyAllWindows()
	# cv2.waitKey(1)