# ahmedtijaninet/OCRExample.py

## OCRExample.py
import cv2
import numpy as np
import pytesseract
import os
import csv
import re

# Percentage of the best (lowest-distance) key-point matches that are kept.
per = 25
# NOTE(review): not referenced anywhere in the visible script; kept as is.
pixelThreshold = 500

# Regions of interest on the aligned image. Each entry is
# [(x1, y1), (x2, y2), field type, field label]; the corner points are taken
# from running ImgKeyPointSelector.py on the query image.
# The field type has to be exactly 'text' for a region to be OCR'd: a stray
# trailing space ('text ') fails the `r[2] == 'text'` check and the field is
# silently skipped.
roi = [[(1206, 304), (1800, 400), 'text', 'Nic No'],
       [(964, 888), (1884, 984), 'text', 'Name']]

# Path of the Tesseract executable. The machine-specific default below can be
# overridden with the TESSERACT_CMD environment variable, so the script can
# run on another machine without editing the source.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    "C:/Users/PunsisiK.LOITL-SE03/AppData/Local/Tesseract-OCR/tesseract.exe",
)

""" format data (OPTIONAL)  """
def idnoformat(id):
    """Extract the identity-card number from raw OCR text.

    The first stand-alone run of digits (delimited by word boundaries) is
    preferred. When the digits are glued to letters (e.g. ``"ID1990...V"``),
    up to 12 characters starting at the first digit are returned instead.

    Args:
        id: Raw text recognised for the ID-number region. (The name shadows
            the builtin but is kept so existing callers are unaffected.)

    Returns:
        The extracted number as a string, or ``'N/A'`` when the text is
        empty or contains no digit at all.
    """
    if not id:
        # Empty OCR output is reported like any other missing value.
        return 'N/A'

    standalone = re.search(r'\b\d+\b', id)
    if standalone:
        return standalone.group()

    # No stand-alone number: fall back to a fixed 12-character window that
    # starts at the first digit found in the text.
    first_digit = re.search(r'\d', id)
    if first_digit is None:
        return 'N/A'
    start = first_digit.start()
    return id[start:start + 12]

def nameformat(name):
    """Clean a name recognised by OCR.

    The text is split on whitespace. Purely alphabetic words are kept as they
    are; every other word has all characters outside ``a-z``/``A-Z`` removed.
    Words that end up empty are dropped and the remaining ones are joined
    with single spaces.

    Args:
        name: Raw text recognised for the name region.

    Returns:
        The cleaned name without leading or trailing blanks, or ``'N/A'``
        when no letters are left.
    """
    words = []
    for token in str(name or '').split():
        cleaned = token if token.isalpha() else re.sub('[^a-zA-Z]+', '', token)
        if cleaned:
            words.append(cleaned)
    # join() puts separators only between words, so no strip is needed
    # (str.lstrip() returns a new string and cannot trim in place).
    return ' '.join(words) if words else 'N/A'


""" read the query image """
imgQ = cv2.imread('QueryImg/New NIC.png')
if imgQ is None:
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an AttributeError on `.shape` below.
    raise FileNotFoundError("Cannot read query image 'QueryImg/New NIC.png'")
h, w, c = imgQ.shape

# Key points and descriptors of the query image (ORB created with 7900).
orb = cv2.ORB_create(7900)
kp1, des1 = orb.detectAndCompute(imgQ, None)

# File names of everything inside the NIC folder.
path = 'NIC'  # name of the folder
myPicList = os.listdir(path)
print(myPicList)

# One list of OCR strings per entry of myPicList, filled by the loop below.
NICdata = []

# The matcher does not depend on the image, so it is created once.
bf = cv2.BFMatcher(cv2.NORM_HAMMING)

for j, y in enumerate(myPicList):
    print(f'################## Extracting Data from Form {j}  ##################')

    # OCR results of this image. It stays empty when the image cannot be
    # processed, so NICdata still gets exactly one entry per file.
    myData = []

    img = cv2.imread(path + "/" + y)
    # cv2.imread returns None (no exception) for unreadable/non-image files.
    kp2, des2 = orb.detectAndCompute(img, None) if img is not None else (None, None)
    if des2 is None:
        print(f'Skipping {y}: unreadable image or no key points found')
        NICdata.append(myData)
        continue

    # Match the sample's descriptors against the query image's descriptors
    # and keep the best `per` percent (smallest distance first).
    # sorted() is used because some OpenCV releases return a tuple from
    # match(), which has no in-place sort().
    matches = sorted(bf.match(des2, des1), key=lambda m: m.distance)
    good = matches[:int(len(matches) * (per / 100))]

    # Coordinates of the matched key points in the sample (queryIdx) and in
    # the query image (trainIdx).
    srcPoints = np.float32([kp2[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dstPoints = np.float32([kp1[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)

    # A homography needs at least four point pairs, and findHomography can
    # return None for the matrix when no model could be estimated.
    M = None
    if len(good) >= 4:
        M, _ = cv2.findHomography(srcPoints, dstPoints, cv2.RANSAC, 5.0)
    if M is None:
        print(f'Skipping {y}: could not align it with the query image')
        NICdata.append(myData)
        continue

    # Warp the sample so that it lines up with the query image.
    imgScan = cv2.warpPerspective(img, M, (w, h))

    imgShow = imgScan.copy()
    imgMask = np.zeros_like(imgShow)

    for r in roi:
        # Highlight the region on the preview image.
        cv2.rectangle(imgMask, (r[0][0], r[0][1]), (r[1][0], r[1][1]), (0, 255, 0), cv2.FILLED)
        imgShow = cv2.addWeighted(imgShow, 0.99, imgMask, 0.1, 0)

        # Cut the region out of the aligned image (rows are y, columns are x).
        imgCrop = imgScan[r[0][1]:r[1][1], r[0][0]:r[1][0]]

        # strip() tolerates stray whitespace in the ROI type (e.g. 'text ').
        if r[2].strip() == 'text':
            # Run OCR once and reuse the result for the log and the data.
            text = pytesseract.image_to_string(imgCrop)
            print('{} :{}'.format(r[3], text))
            myData.append(text)

    NICdata.append(myData)

    cv2.imshow(y+"2", imgShow)

print(NICdata)

""" Writting data to a CSV file (OPTIONAL)  """
with open('NicData.csv', 'w', newline='', encoding='utf-16') as csv_file:
    csv_out = csv.writer(csv_file)
    csv_out.writerow(['ID', 'Name'])
    for fields in NICdata:
        # The first OCR string is treated as the ID and the second one as the
        # name; whatever is missing is written as "N/A".
        id_cell = idnoformat(fields[0]) if len(fields) >= 1 else "N/A"
        name_cell = nameformat(fields[1]) if len(fields) >= 2 else "N/A"
        csv_out.writerow([id_cell, name_cell])


# Show the query image and wait for a key press (waitKey(0)).
cv2.imshow("Output", imgQ)
cv2.waitKey(0)
	import cv2
	import numpy as np
	import pytesseract
	import os
	import csv
	import re

	# Percentage of the best (lowest-distance) key-point matches that are kept.
	per = 25
	# NOTE(review): not referenced anywhere in the visible script; kept as is.
	pixelThreshold = 500

	# Regions of interest on the aligned image. Each entry is
	# [(x1, y1), (x2, y2), field type, field label]; the corner points are taken
	# from running ImgKeyPointSelector.py on the query image.
	# The field type has to be exactly 'text' for a region to be OCR'd: a stray
	# trailing space ('text ') fails the `r[2] == 'text'` check and the field is
	# silently skipped.
	roi = [[(1206, 304), (1800, 400), 'text', 'Nic No'],
	       [(964, 888), (1884, 984), 'text', 'Name']]

	# Path of the Tesseract executable. The machine-specific default below can be
	# overridden with the TESSERACT_CMD environment variable, so the script can
	# run on another machine without editing the source.
	pytesseract.pytesseract.tesseract_cmd = os.environ.get(
	    "TESSERACT_CMD",
	    "C:/Users/PunsisiK.LOITL-SE03/AppData/Local/Tesseract-OCR/tesseract.exe",
	)

	""" format data (OPTIONAL) """
	def idnoformat(id):
	    """Extract the identity-card number from raw OCR text.

	    The first stand-alone run of digits (delimited by word boundaries) is
	    preferred. When the digits are glued to letters (e.g. ``"ID1990...V"``),
	    up to 12 characters starting at the first digit are returned instead.

	    Args:
	        id: Raw text recognised for the ID-number region. (The name shadows
	            the builtin but is kept so existing callers are unaffected.)

	    Returns:
	        The extracted number as a string, or ``'N/A'`` when the text is
	        empty or contains no digit at all.
	    """
	    if not id:
	        # Empty OCR output is reported like any other missing value.
	        return 'N/A'

	    standalone = re.search(r'\b\d+\b', id)
	    if standalone:
	        return standalone.group()

	    # No stand-alone number: fall back to a fixed 12-character window that
	    # starts at the first digit found in the text.
	    first_digit = re.search(r'\d', id)
	    if first_digit is None:
	        return 'N/A'
	    start = first_digit.start()
	    return id[start:start + 12]

	def nameformat(name):
	    """Clean a name recognised by OCR.

	    The text is split on whitespace. Purely alphabetic words are kept as they
	    are; every other word has all characters outside ``a-z``/``A-Z`` removed.
	    Words that end up empty are dropped and the remaining ones are joined
	    with single spaces.

	    Args:
	        name: Raw text recognised for the name region.

	    Returns:
	        The cleaned name without leading or trailing blanks, or ``'N/A'``
	        when no letters are left.
	    """
	    words = []
	    for token in str(name or '').split():
	        cleaned = token if token.isalpha() else re.sub('[^a-zA-Z]+', '', token)
	        if cleaned:
	            words.append(cleaned)
	    # join() puts separators only between words, so no strip is needed
	    # (str.lstrip() returns a new string and cannot trim in place).
	    return ' '.join(words) if words else 'N/A'


	""" read the query image """
	imgQ = cv2.imread('QueryImg/New NIC.png')
	if imgQ is None:
	    # cv2.imread reports failure by returning None instead of raising, which
	    # would otherwise surface as an AttributeError on `.shape` below.
	    raise FileNotFoundError("Cannot read query image 'QueryImg/New NIC.png'")
	h, w, c = imgQ.shape

	# Key points and descriptors of the query image (ORB created with 7900).
	orb = cv2.ORB_create(7900)
	kp1, des1 = orb.detectAndCompute(imgQ, None)

	# File names of everything inside the NIC folder.
	path = 'NIC'  # name of the folder
	myPicList = os.listdir(path)
	print(myPicList)

	# One list of OCR strings per entry of myPicList, filled by the loop below.
	NICdata = []

	# The matcher does not depend on the image, so it is created once.
	bf = cv2.BFMatcher(cv2.NORM_HAMMING)

	for j, y in enumerate(myPicList):
	    print(f'################## Extracting Data from Form {j} ##################')

	    # OCR results of this image. It stays empty when the image cannot be
	    # processed, so NICdata still gets exactly one entry per file.
	    myData = []

	    img = cv2.imread(path + "/" + y)
	    # cv2.imread returns None (no exception) for unreadable/non-image files.
	    kp2, des2 = orb.detectAndCompute(img, None) if img is not None else (None, None)
	    if des2 is None:
	        print(f'Skipping {y}: unreadable image or no key points found')
	        NICdata.append(myData)
	        continue

	    # Match the sample's descriptors against the query image's descriptors
	    # and keep the best `per` percent (smallest distance first).
	    # sorted() is used because some OpenCV releases return a tuple from
	    # match(), which has no in-place sort().
	    matches = sorted(bf.match(des2, des1), key=lambda m: m.distance)
	    good = matches[:int(len(matches) * (per / 100))]

	    # Coordinates of the matched key points in the sample (queryIdx) and in
	    # the query image (trainIdx).
	    srcPoints = np.float32([kp2[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
	    dstPoints = np.float32([kp1[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)

	    # A homography needs at least four point pairs, and findHomography can
	    # return None for the matrix when no model could be estimated.
	    M = None
	    if len(good) >= 4:
	        M, _ = cv2.findHomography(srcPoints, dstPoints, cv2.RANSAC, 5.0)
	    if M is None:
	        print(f'Skipping {y}: could not align it with the query image')
	        NICdata.append(myData)
	        continue

	    # Warp the sample so that it lines up with the query image.
	    imgScan = cv2.warpPerspective(img, M, (w, h))

	    imgShow = imgScan.copy()
	    imgMask = np.zeros_like(imgShow)

	    for r in roi:
	        # Highlight the region on the preview image.
	        cv2.rectangle(imgMask, (r[0][0], r[0][1]), (r[1][0], r[1][1]), (0, 255, 0), cv2.FILLED)
	        imgShow = cv2.addWeighted(imgShow, 0.99, imgMask, 0.1, 0)

	        # Cut the region out of the aligned image (rows are y, columns are x).
	        imgCrop = imgScan[r[0][1]:r[1][1], r[0][0]:r[1][0]]

	        # strip() tolerates stray whitespace in the ROI type (e.g. 'text ').
	        if r[2].strip() == 'text':
	            # Run OCR once and reuse the result for the log and the data.
	            text = pytesseract.image_to_string(imgCrop)
	            print('{} :{}'.format(r[3], text))
	            myData.append(text)

	    NICdata.append(myData)

	    cv2.imshow(y+"2", imgShow)

	print(NICdata)

	""" Writting data to a CSV file (OPTIONAL) """
	with open('NicData.csv', 'w', newline='', encoding='utf-16') as f:
	    theWriter = csv.writer(f)
	    theWriter.writerow(['ID', 'Name'])
	    for data in NICdata:
	        # The first OCR string is treated as the ID and the second one as
	        # the name; whatever is missing is written as "N/A".
	        if len(data) == 0:
	            theWriter.writerow(["N/A", "N/A"])
	        elif len(data) == 1:
	            theWriter.writerow([idnoformat(data[0]), "N/A"])
	        else:
	            theWriter.writerow([idnoformat(data[0]), nameformat(data[1])])


	# Show the query image and wait for a key press (waitKey(0)).
	cv2.imshow("Output", imgQ)
	cv2.waitKey(0)