Skip to content

Instantly share code, notes, and snippets.

@VpkPrasanna
Last active October 12, 2021 06:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save VpkPrasanna/bcc2153130fa45bf5bd82518d0eb0769 to your computer and use it in GitHub Desktop.
Save VpkPrasanna/bcc2153130fa45bf5bd82518d0eb0769 to your computer and use it in GitHub Desktop.
This script extracts each text instance found in an image and saves it as a separate image.
# Necessary Imports
import textseg as ts
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import cv2
import json
import pandas as pd
import glob
import os
# Module-level accumulators: one entry per processed resume is appended to
# each of the three lists below.
# NOTE(review): `data` is not used anywhere in the visible code.
data = pd.DataFrame()
final_name_list, final_text_opencv, final_text_tessaract = [], [], []
# Convert every resume PDF into page images, run text segmentation on each
# page, save the segmented results as PNG files, and collect the extracted
# text per resume in the final_* lists.
# NOTE(review): `resumes` (paths of all resume files), `path_to_write` and
# `text_from_tesseract` are not defined in the visible code; they must exist
# before this loop runs. `path_to_write` is used as a plain string prefix, so
# it presumably ends with a path separator - confirm.
for resume_path in resumes:
    # Open the PDF in a context manager so the handle is closed once the
    # page count has been reported.
    with open(resume_path, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        # Check how many pages each PDF contains
        print(pdf.getNumPages())
    # Get the file name of each PDF file
    fname = os.path.basename(resume_path)
    # Everything before the first dot is used as the stem of all output names.
    base_name = fname.split('.')[0]
    # Convert the PDF into one image object per page
    images = convert_from_path(resume_path)
    resumes_img = []
    # Save every page as a JPEG and remember its path so it can be passed
    # through the model below.
    for page_index, page in enumerate(images):
        page_path = path_to_write + base_name + '_' + str(page_index) + '.jpg'
        page.save(page_path, 'JPEG')
        resumes_img.append(page_path)
    # Name recorded for this resume (no page index is included).
    name_list = base_name + '_' + '.jpg'
    text_opencv = []
    text_tessaract = []
    for image_path in resumes_img:
        # Read the page image using OpenCV
        frame = cv2.imread(image_path)
        # Delete the intermediate JPEG once it has been loaded into memory.
        os.remove(image_path)
        # Use the file name itself rather than a fixed path component, so the
        # directory depth of path_to_write does not matter.
        img = os.path.basename(image_path)
        img_stem = img.split('.')[0]
        # Pass the image to the model to get the text instances present in
        # the image.
        output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)
        cv2.imwrite(path_to_write + img_stem + ".png", output_img)
        # Save each text instance as an individual image.
        # NOTE(review): the segment index is appended without a separator, so
        # segment 0 of page 1 ("<name>_10.png") shares its name with the
        # output image of page 10 - confirm whether this matters.
        for seg_index, segment in enumerate(split_img):
            cv2.imwrite(path_to_write + img_stem + str(seg_index) + ".png", segment)
        text_opencv.append(c_dict)
        text_tessaract += text_from_tesseract(output_img)
    tesseract_str = ''.join(text_tessaract)
    final_name_list.append(name_list)
    final_text_opencv.append(text_opencv)
    final_text_tessaract.append(tesseract_str)
# Index 0 is selected because a single one-page PDF was passed as input;
# skip the print when nothing was processed.
if final_text_opencv:
    print(final_text_opencv[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment