Skip to content

Instantly share code, notes, and snippets.

@onyekaa
Created April 18, 2019 12:36
Show Gist options
  • Save onyekaa/ce5c97e6eaa391c80b378324a38b5561 to your computer and use it in GitHub Desktop.
Save onyekaa/ce5c97e6eaa391c80b378324a38b5561 to your computer and use it in GitHub Desktop.
OCR Experiment
# import the necessary packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import csv
# Geometry used to isolate the table region. The page is first resized to a
# fixed size so that the same crop window applies to every input image.
TARGET_SIZE = (2200, 1700)  # (width, height) as expected by cv2.resize
CROP_TOP, CROP_BOTTOM = 641, 1349
# CROP_RIGHT is larger than the resized width; slicing clamps it to the edge.
CROP_LEFT, CROP_RIGHT = 40, 2224

# Lines with fewer whitespace-separated tokens than this are treated as
# broken OCR output and skipped.
MIN_COLUMNS = 8

# Studio names that mark the end of the title column.
KNOWN_STUDIOS = ('INDP', 'BLUE PICS', 'FILMONE', 'SILVERBIRD', 'CRIMSON')

CSV_HEADER = ['no', 'title', 'studio', 'dis', 'weeks', 'locs',
              'weekend revenue', 'rev_change', 'avg_loc', 'admissions',
              'adm_change', 'flash_rev', 'flash_adm']


def preprocess(gray, method):
    """Return *gray* after the requested preprocessing step.

    ``"thresh"`` applies Otsu binarisation, ``"blur"`` applies a 3x3 median
    blur to remove noise; any other value returns the image unchanged.
    """
    if method == "thresh":
        return cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    if method == "blur":
        return cv2.medianBlur(gray, 3)
    return gray


def ocr_image(gray):
    """Run Tesseract (``--psm 6``) on *gray* and return the recognised text.

    The image is handed to Tesseract through a temporary PNG named after the
    current process id; the file is always removed afterwards.
    """
    filename = "{}.png".format(os.getpid())
    try:
        cv2.imwrite(filename, gray)
        # The context manager closes the file handle before we delete it.
        with Image.open(filename) as pil_image:
            return pytesseract.image_to_string(pil_image, config='--psm 6')
    finally:
        if os.path.exists(filename):
            os.remove(filename)


def parse_rows(text):
    """Split OCR *text* into CSV rows.

    Each usable line becomes ``[row_number, title, studio, ...]``:

    * lines with fewer than ``MIN_COLUMNS`` tokens are skipped entirely
      (they are not merged into the following line);
    * the first token (the row number as read by OCR) is replaced by a
      running count of the rows that were kept;
    * every token up to the first all-uppercase word (or known studio name)
      is joined into the title, which is emitted exactly once;
    * all remaining tokens are emitted unchanged, one per column, except
      that a multi-word studio name such as ``BLUE PICS`` is re-joined into
      a single column.

    Returns a list of rows; an empty *text* yields an empty list.
    """
    rows = []
    # splitlines() also yields a final line that has no trailing newline.
    for line in text.splitlines():
        # split() without arguments collapses runs of whitespace, so repeated
        # spaces do not produce empty columns.
        tokens = line.split()
        if len(tokens) < MIN_COLUMNS:
            continue

        # Collect title words: everything after the row number up to the
        # first token that looks like a studio.
        idx = 1
        title_words = []
        while idx < len(tokens):
            token = tokens[idx]
            if token.isupper() or token in KNOWN_STUDIOS:
                break
            title_words.append(token)
            idx += 1

        rest = tokens[idx:]
        # Re-join studio names that contain a space so the columns stay
        # aligned with the header.
        for studio in KNOWN_STUDIOS:
            parts = studio.split()
            if len(parts) > 1 and rest[:len(parts)] == parts:
                rest = [studio] + rest[len(parts):]
                break

        rows.append([len(rows) + 1, ' '.join(title_words)] + rest)
    return rows


def write_csv(csvname, rows):
    """Write the header followed by *rows* to *csvname*."""
    # newline='' lets the csv module control line endings itself. The
    # default doublequote behaviour already handles embedded quotes, so no
    # escapechar is configured.
    with open(csvname, mode='w', newline='') as b_file:
        bw = csv.writer(b_file, delimiter=',', quotechar='"',
                        lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
        bw.writerow(CSV_HEADER)
        bw.writerows(rows)


def main():
    """Crop the input image, OCR it, print the text and write a CSV.

    Writes ``<image-stem>-cropped.png`` and ``<image-stem>.csv`` next to the
    input image.
    """
    # construct the argument parse and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True,
                    help="path to input image to be OCR'd")
    ap.add_argument("-p", "--preprocess", type=str, default="thresh",
                    help="type of preprocessing to be done")
    args = vars(ap.parse_args())

    source = cv2.imread(args["image"])
    if source is None:
        # cv2.imread signals failure by returning None instead of raising.
        ap.error("could not read image: {}".format(args["image"]))

    fname = os.path.splitext(args['image'])[0]
    crname = '{}-cropped.png'.format(fname)

    # Resize first so the crop window is consistent across inputs, then crop
    # the *resized* image to drop everything outside the table.
    resized = cv2.resize(source, TARGET_SIZE)
    cropimg = resized[CROP_TOP:CROP_BOTTOM, CROP_LEFT:CROP_RIGHT]
    cv2.imwrite(crname, cropimg)

    # Work on the in-memory crop; reading the PNG back would give the same
    # pixels.
    gray = cv2.cvtColor(cropimg, cv2.COLOR_BGR2GRAY)
    gray = preprocess(gray, args["preprocess"])

    text = ocr_image(gray)
    print(text)

    write_csv('{}.csv'.format(fname), parse_rows(text))


if __name__ == "__main__":
    main()
# show the output images
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
# cv2.waitKey(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment