VpkPrasanna/East Text Detection.py

## East Text Detection.py
# Necessary Imports
import textseg as ts
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import cv2
import json
import pandas as pd
import glob
import os

# Per-PDF results; the three lists are appended to once per PDF, so they stay
# index-aligned. `data` is created here but not used in this section.
data = pd.DataFrame()
final_name_list = []
final_text_opencv = []
final_text_tessaract = []

# Path of all Resume files
# NOTE(review): `resumes` (iterable of PDF paths), `path_to_write` (output
# directory) and `text_from_tesseract` are not defined in this part of the
# file; they must be in scope before this loop runs.
for pdf_path in resumes:
    # File name of the PDF and its stem. splitext (rather than split('.')[0])
    # keeps dotted names such as "john.doe.pdf" distinct from "john.smith.pdf".
    fname = os.path.basename(pdf_path)
    stem = os.path.splitext(fname)[0]

    # Check how many pages the PDF contains. The handle is only needed for the
    # page count, so close it as soon as that is read.
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        print(pdf.getNumPages())

    # Convert the PDF pages to images and save each page as a JPEG so it can
    # be read back with OpenCV below.
    images = convert_from_path(pdf_path)
    resumes_img = []
    for page_no, page in enumerate(images):
        page_path = os.path.join(path_to_write, '{}_{}.jpg'.format(stem, page_no))
        page.save(page_path, 'JPEG')
        resumes_img.append(page_path)

    # NOTE(review): this label carries no page index ("<stem>_.jpg"); kept as
    # in the original because consumers of final_name_list are not visible.
    name_list = stem + '_' + '.jpg'
    text_opencv = []
    text_tessaract = []
    for img_path in resumes_img:
        # Read the page image using OpenCV, then delete the temporary JPEG.
        frame = cv2.imread(img_path)
        os.remove(img_path)

        # basename instead of split("/")[2]: works for any output directory
        # depth, not only paths of the form "./dir/file".
        img = os.path.basename(img_path)
        img_stem = os.path.splitext(img)[0]

        # Pass the image to the model to get the text instances present in it.
        output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)
        cv2.imwrite(os.path.join(path_to_write, img_stem + ".png"), output_img)

        # Save every detected text instance as an individual image. The "_"
        # separator keeps crop names from colliding with annotated page
        # images (e.g. page 1 / crop 0 versus page 10).
        for seg_no in range(len(split_img)):
            crop_path = os.path.join(path_to_write, '{}_{}.png'.format(img_stem, seg_no))
            cv2.imwrite(crop_path, split_img[seg_no])

        text_opencv.append(c_dict)
        text_tessaract += text_from_tesseract(output_img)

    # Join once per PDF, after all pages are processed; this is also defined
    # (as an empty string) when the PDF produced no page images.
    tesseract_str = ''.join(text_tessaract)
    final_name_list.append(name_list)
    final_text_opencv.append(text_opencv)
    final_text_tessaract.append(tesseract_str)

# we are selecting the index 0 as we have passed one PDF as a input which contains one Page
print(final_text_opencv[0])
	# Necessary Imports
	import textseg as ts
	from PyPDF2 import PdfFileReader
	from pdf2image import convert_from_path
	import cv2
	import json
	import pandas as pd
	import glob
	import os

	# Loop nesting in this section had been flattened; it is restored here from
	# the statement order. Per-PDF results are appended once per PDF, so the
	# three lists stay index-aligned. `data` is not used in this section.
	data = pd.DataFrame()
	final_name_list = []
	final_text_opencv = []
	final_text_tessaract = []

	# Path of all Resume files
	# NOTE(review): `resumes` (iterable of PDF paths), `path_to_write` (output
	# directory) and `text_from_tesseract` are not defined in this part of the
	# file; they must be in scope before this loop runs.
	for pdf_path in resumes:
		# File name of the PDF and its stem. splitext (rather than
		# split('.')[0]) keeps dotted file names distinct from each other.
		fname = os.path.basename(pdf_path)
		stem = os.path.splitext(fname)[0]

		# Check how many pages the PDF contains. The handle is only needed
		# for the page count, so close it as soon as that is read.
		with open(pdf_path, 'rb') as pdf_file:
			pdf = PdfFileReader(pdf_file)
			print(pdf.getNumPages())

		# Convert the PDF pages to images and save each page as a JPEG so it
		# can be read back with OpenCV below.
		images = convert_from_path(pdf_path)
		resumes_img = []
		for page_no, page in enumerate(images):
			page_path = os.path.join(path_to_write, '{}_{}.jpg'.format(stem, page_no))
			page.save(page_path, 'JPEG')
			resumes_img.append(page_path)

		# NOTE(review): this label carries no page index ("<stem>_.jpg");
		# kept unchanged because its consumers are not visible here.
		name_list = stem + '_' + '.jpg'
		text_opencv = []
		text_tessaract = []
		for img_path in resumes_img:
			# Read the page image using OpenCV, then delete the temporary JPEG.
			frame = cv2.imread(img_path)
			os.remove(img_path)

			# basename instead of split("/")[2]: independent of how deep the
			# output directory path is.
			img = os.path.basename(img_path)
			img_stem = os.path.splitext(img)[0]

			# Pass the image to the model to get the text instances present in it.
			output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)
			cv2.imwrite(os.path.join(path_to_write, img_stem + ".png"), output_img)

			# Save every detected text instance as an individual image. The
			# "_" separator keeps crop names from colliding with annotated
			# page images (e.g. page 1 / crop 0 versus page 10).
			for seg_no in range(len(split_img)):
				crop_path = os.path.join(path_to_write, '{}_{}.png'.format(img_stem, seg_no))
				cv2.imwrite(crop_path, split_img[seg_no])

			text_opencv.append(c_dict)
			text_tessaract += text_from_tesseract(output_img)

		# Join once per PDF, after all pages are processed; this is also
		# defined (as an empty string) when the PDF produced no page images.
		tesseract_str = ''.join(text_tessaract)
		final_name_list.append(name_list)
		final_text_opencv.append(text_opencv)
		final_text_tessaract.append(tesseract_str)

	# we are selecting the index 0 as we have passed one PDF as a input which contains one Page
	print(final_text_opencv[0])