hernamesbarbara/README.md

## README.md

      
    Raw
  

              README.md
            
          
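    Install the required Python libraries (pytesseract also needs the Tesseract OCR engine itself to be installed and on your PATH):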

pip3 install -r reqs.txt


Then run the script on the bundled test image (test.jpg):

➜ ~ python3  parseimg.py


trying to parse text in: test.jpg
processing text contour 1 of 23
processing text contour 2 of 23
processing text contour 3 of 23
processing text contour 4 of 23
processing text contour 5 of 23
processing text contour 6 of 23
processing text contour 7 of 23
processing text contour 8 of 23
processing text contour 9 of 23
processing text contour 10 of 23
processing text contour 11 of 23
processing text contour 12 of 23
processing text contour 13 of 23
processing text contour 14 of 23
processing text contour 15 of 23
processing text contour 16 of 23
processing text contour 17 of 23
processing text contour 18 of 23
processing text contour 19 of 23
processing text contour 20 of 23
processing text contour 21 of 23
processing text contour 22 of 23
processing text contour 23 of 23
Saved Summary: recognized.txt
saved PDF results: outfile.pdf


## parseimg.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import cv2
from PIL import Image
import pytesseract

# Output locations for the plain-text summary and the PDF rendering.
OUT_SUMMARY = "recognized.txt"
OUT_PDF = "outfile.pdf"

# Image to run OCR on, resolved relative to the current working directory.
INFILE = "test.jpg"

print("trying to parse text in: {}".format(INFILE))

img = cv2.imread(INFILE)
# cv2.imread reports failure by returning None instead of raising, so stop
# here with a readable message rather than crashing inside cvtColor below.
if img is None:
    sys.exit("could not read image (missing or unsupported file): {}".format(INFILE))

# Convert to gray scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# With THRESH_OTSU the threshold is chosen automatically (the 0 passed here is
# ignored). THRESH_BINARY_INV inverts the result, so for dark text on a light
# page the text pixels end up white (255) on a black background.
ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

# A smaller kernel size e.g. (10, 10) will detect
# each word vs. multiple words / tokens in sequence
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))

# Dilation grows every white region by the kernel, so neighbouring characters
# merge into one connected blob; each blob then yields a single contour.
dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)

# Finding contours: RETR_EXTERNAL keeps only the outermost contour of each
# blob, CHAIN_APPROX_NONE stores every boundary point without compression.
contours, hierarchy = cv2.findContours(
    dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

# Working copy that receives the bounding-box drawings; `img` stays untouched.
im2 = img.copy()

def append_line_to_file(data, outfile):
    """Write ``data`` to ``outfile`` followed by a single line terminator.

    Args:
        data: Text to write.
        outfile: A writable text-mode file object.

    Returns:
        None.
    """
    outfile.write(data)
    # Use "\n", not os.linesep: a text-mode file already translates "\n" to
    # the platform line ending, so writing os.linesep gives "\r\r\n" on Windows.
    outfile.write("\n")


# Start the summary from scratch on every run and keep it open for the whole
# loop instead of reopening it once per contour. The file begins with a blank
# line, as it did when it was created separately before the loop.
with open(OUT_SUMMARY, "w", encoding="utf-8") as outfile:
    # "\n" rather than os.linesep: text mode already translates newlines.
    outfile.write("\n")
    total = len(contours)
    for i, cnt in enumerate(contours, start=1):
        print("processing text contour {} of {}".format(i, total))
        x, y, w, h = cv2.boundingRect(cnt)
        # Draw the box on the annotated copy only.
        cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Crop from the untouched image: cropping im2 would feed the green
        # box outlines drawn above (and by earlier iterations) into the OCR.
        cropped = img[y:y + h, x:x + w]
        text = pytesseract.image_to_string(cropped)
        append_line_to_file(text, outfile)

print("Saved Summary: {}".format(OUT_SUMMARY))

# image_to_pdf_or_hocr returns the document as bytes, hence the binary write.
pdf = pytesseract.image_to_pdf_or_hocr(INFILE, extension='pdf')

with open(OUT_PDF, 'wb') as out_boxfile:
    out_boxfile.write(pdf)

print("saved PDF results: {}".format(OUT_PDF))

## reqs.txt
Pillow==9.1.0
opencv-python==4.5.5.64
pytesseract==0.3.9
numpy==1.19.4
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	import sys
	import os
	import cv2
	from PIL import Image
	import pytesseract

	# Output locations for the plain-text summary and the PDF rendering.
	OUT_SUMMARY = "recognized.txt"
	OUT_PDF = "outfile.pdf"

	# Image to run OCR on, resolved relative to the current working directory.
	INFILE = "test.jpg"

	print("trying to parse text in: {}".format(INFILE))

	img = cv2.imread(INFILE)
	# cv2.imread reports failure by returning None instead of raising, so stop
	# here with a readable message rather than crashing inside cvtColor below.
	if img is None:
	    sys.exit("could not read image (missing or unsupported file): {}".format(INFILE))

	# Convert to gray scale
	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	# With THRESH_OTSU the threshold is chosen automatically (the 0 passed here is
	# ignored). THRESH_BINARY_INV inverts the result, so for dark text on a light
	# page the text pixels end up white (255) on a black background.
	ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

	# A smaller kernel size e.g. (10, 10) will detect
	# each word vs. multiple words / tokens in sequence
	rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))

	# Dilation grows every white region by the kernel, so neighbouring characters
	# merge into one connected blob; each blob then yields a single contour.
	dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)

	# Finding contours: RETR_EXTERNAL keeps only the outermost contour of each
	# blob, CHAIN_APPROX_NONE stores every boundary point without compression.
	contours, hierarchy = cv2.findContours(
	    dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

	# Working copy that receives the bounding-box drawings; `img` stays untouched.
	im2 = img.copy()

	def append_line_to_file(data, outfile):
	    """Write ``data`` to ``outfile`` followed by a single line terminator.

	    Args:
	        data: Text to write.
	        outfile: A writable text-mode file object.

	    Returns:
	        None.
	    """
	    outfile.write(data)
	    # Use "\n", not os.linesep: a text-mode file already translates "\n" to
	    # the platform line ending, so writing os.linesep gives "\r\r\n" on Windows.
	    outfile.write("\n")


	# Start the summary from scratch on every run and keep it open for the whole
	# loop instead of reopening it once per contour. The file begins with a blank
	# line, as it did when it was created separately before the loop.
	with open(OUT_SUMMARY, "w", encoding="utf-8") as outfile:
	    # "\n" rather than os.linesep: text mode already translates newlines.
	    outfile.write("\n")
	    total = len(contours)
	    for i, cnt in enumerate(contours, start=1):
	        print("processing text contour {} of {}".format(i, total))
	        x, y, w, h = cv2.boundingRect(cnt)
	        # Draw the box on the annotated copy only.
	        cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
	        # Crop from the untouched image: cropping im2 would feed the green
	        # box outlines drawn above (and by earlier iterations) into the OCR.
	        cropped = img[y:y + h, x:x + w]
	        text = pytesseract.image_to_string(cropped)
	        append_line_to_file(text, outfile)

	print("Saved Summary: {}".format(OUT_SUMMARY))

	# image_to_pdf_or_hocr returns the document as bytes, hence the binary write.
	pdf = pytesseract.image_to_pdf_or_hocr(INFILE, extension='pdf')

	with open(OUT_PDF, 'wb') as out_boxfile:
	    out_boxfile.write(pdf)

	print("saved PDF results: {}".format(OUT_PDF))
	Pillow==9.1.0
	opencv-python==4.5.5.64
	pytesseract==0.3.9
	numpy==1.19.4