Skip to content

Instantly share code, notes, and snippets.

@hernamesbarbara
Last active April 8, 2022 15:57
Show Gist options
  • Save hernamesbarbara/418a6819f3d6c51db56a9d2d4a359b75 to your computer and use it in GitHub Desktop.
Save hernamesbarbara/418a6819f3d6c51db56a9d2d4a359b75 to your computer and use it in GitHub Desktop.

Install the required libraries:


pip3 install -r reqs.txt

Then run the script on the test image file like so:


➜ ~ python3  parseimg.py

trying to parse text in: test.jpg
processing text contour 1 of 23
processing text contour 2 of 23
processing text contour 3 of 23
processing text contour 4 of 23
processing text contour 5 of 23
processing text contour 6 of 23
processing text contour 7 of 23
processing text contour 8 of 23
processing text contour 9 of 23
processing text contour 10 of 23
processing text contour 11 of 23
processing text contour 12 of 23
processing text contour 13 of 23
processing text contour 14 of 23
processing text contour 15 of 23
processing text contour 16 of 23
processing text contour 17 of 23
processing text contour 18 of 23
processing text contour 19 of 23
processing text contour 20 of 23
processing text contour 21 of 23
processing text contour 22 of 23
processing text contour 23 of 23
Saved Summary: recognized.txt
saved PDF results: outfile.pdf

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import cv2
from PIL import Image
import pytesseract
# File names used by the script: the plain-text OCR summary, the PDF
# rendering, and the input image.
OUT_SUMMARY = "recognized.txt"
OUT_PDF = "outfile.pdf"
INFILE = "test.jpg"

print("trying to parse text in: {}".format(INFILE))
img = cv2.imread(INFILE)
# cv2.imread reports a missing or undecodable file by returning None rather
# than raising, so stop here with a clear message instead of letting
# cvtColor fail later with an obscure assertion error.
if img is None:
    sys.exit("could not read image file: {}".format(INFILE))

# Convert to gray scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# With THRESH_OTSU the threshold is chosen automatically (the 0 is ignored).
# THRESH_BINARY_INV maps pixels at or below the threshold to 255 and the rest
# to 0, so darker pixels become the white foreground.
ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

# A smaller kernel size e.g. (10, 10) will detect
# each word vs. multiple words / tokens in sequence
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
# Dilation grows the white foreground regions by the kernel size, so
# neighbouring glyphs fuse into a single blob and each contour found below
# covers a run of text rather than an individual character.
dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)

# Finding contours. OpenCV 3.x returns (image, contours, hierarchy) while
# 4.x returns (contours, hierarchy); slicing the last two items works on both.
contours, hierarchy = cv2.findContours(
    dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2:]
# Working copy that receives the bounding-box drawings; img stays untouched.
im2 = img.copy()
def append_line_to_file(data, outfile):
    """Write *data* to *outfile* followed by a single line terminator.

    Args:
        data: Text to write.
        outfile: A writable text-mode file object (anything with ``write``).

    Returns:
        None.
    """
    outfile.write(data)
    # Text-mode files translate "\n" to the platform line ending on their
    # own; writing os.linesep here would yield "\r\r\n" on Windows.
    outfile.write("\n")
# Create the summary file fresh (truncating any previous run) and keep it
# open for the whole loop instead of reopening it once per contour.
# UTF-8 is set explicitly because OCR output may contain non-ASCII
# characters that a platform default encoding cannot represent.
with open(OUT_SUMMARY, "w", encoding="utf-8") as outfile:
    # The summary has always started with one blank line; keep that layout.
    # "\n" is used because text mode already translates line endings.
    outfile.write("\n")
    total = len(contours)
    for i, cnt in enumerate(contours, start=1):
        print("processing text contour {} of {}".format(i, total))
        x, y, w, h = cv2.boundingRect(cnt)
        # Mark the detected region on the annotated copy only.
        cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Crop from the untouched image: cropping from im2 would include the
        # green frame just drawn (and any earlier overlapping frames) in the
        # pixels handed to the OCR engine.
        cropped = img[y:y + h, x:x + w]
        text = pytesseract.image_to_string(cropped)
        append_line_to_file(text, outfile)
print("Saved Summary: {}".format(OUT_SUMMARY))

# Render the full input image as a PDF via pytesseract; the call returns
# bytes, so the output file is opened in binary mode.
pdf = pytesseract.image_to_pdf_or_hocr(INFILE, extension='pdf')
with open(OUT_PDF, 'wb') as out_boxfile:
    out_boxfile.write(pdf)
print("saved PDF results: {}".format(OUT_PDF))
Pillow==9.1.0
opencv-python==4.5.5.64
pytesseract==0.3.9
numpy==1.19.4
@hernamesbarbara
Copy link
Author

test

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment