Created
December 29, 2019 17:02
-
-
Save shivankgtm/0addb78ec7132ff7d18bc5198dbc292b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
# Folder that holds the PDFs to process; edit this before running the script.
path = '/home/shivank98/Desktop/Motion'


def find_pdf_files(root):
    """Return the paths of all PDF files found under *root*, recursively.

    A file counts as a PDF when its name ends with ``.pdf`` (compared
    case-insensitively).  A plain substring test would also pick up names
    such as ``notes.pdf.bak`` and would miss ``SCAN.PDF``.

    Args:
        root: Directory to walk.  A non-existent directory yields an
            empty list because ``os.walk`` produces nothing for it.

    Returns:
        A list of path strings in the order ``os.walk`` visits them.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.lower().endswith('.pdf'):
                found.append(os.path.join(dirpath, filename))
    return found


# Paths of every PDF to be parsed by the extraction loop further down.
files = find_pdf_files(path)

# One row of extracted fields per PDF; filled in by the extraction loop.
data = []
def get_roll_number(s):
    """Return the 10 characters that follow the first ':' in *s*.

    NOTE(review): this relies on the first colon of the extracted page text
    belonging to the roll-number label -- confirm against a sample PDF.

    Args:
        s: Text extracted from a PDF page.

    Returns:
        Up to 10 characters following the first colon, or ``''`` when the
        text contains no colon at all.
    """
    colon = s.find(':')
    if colon == -1:
        # str.find reports "not found" as -1; slicing from there would
        # silently return the first ten characters of unrelated text.
        return ''
    start = colon + 1
    return s[start:start + 10]
def get_gender(s):
    """Return the single character that directly follows ``"Gender :"``.

    Args:
        s: Text extracted from a PDF page.

    Returns:
        The one character after the first ``"Gender :"`` marker, or ``''``
        when the marker is absent or is the very last thing in the text.
    """
    marker = "Gender :"
    pos = s.find(marker)
    if pos == -1:
        # Without this guard the -1 from str.find would index an unrelated
        # character (or raise IndexError on short text).
        return ''
    start = pos + len(marker)
    # Slicing, unlike indexing, yields '' instead of raising when the
    # marker sits at the end of the text.
    return s[start:start + 1]
def get_category(s):
    """Return the two characters that directly follow ``"Category :"``.

    Args:
        s: Text extracted from a PDF page.

    Returns:
        Up to two characters after the first ``"Category :"`` marker, or
        ``''`` when the marker is absent.
    """
    marker = "Category :"
    pos = s.find(marker)
    if pos == -1:
        # str.find returns -1 for "not found"; slicing relative to that
        # would return two characters of unrelated text.
        return ''
    start = pos + len(marker)
    return s[start:start + 2]
def get_application_number(s):
    """Return the 12 characters found after ``"Application Number:"``.

    One character immediately after the marker is skipped before the
    12-character window starts (presumably a separating space -- confirm
    against a sample PDF).

    Args:
        s: Text extracted from a PDF page.

    Returns:
        Up to 12 characters, or ``''`` when the marker is absent.
    """
    marker = "Application Number:"
    pos = s.find(marker)
    if pos == -1:
        # Guard against str.find's -1, which would otherwise make the
        # slice return unrelated text from near the start of the page.
        return ''
    start = pos + len(marker) + 1  # +1 skips the character after the colon
    return s[start:start + 12]
def get_timing_of_test(s):
    """Return the three characters that directly follow ``"Shift"``.

    Only the first occurrence of ``"Shift"`` in the text is used.

    Args:
        s: Text extracted from a PDF page.

    Returns:
        Up to three characters after the marker (any leading space is
        kept as-is), or ``''`` when the marker is absent.
    """
    marker = "Shift"
    pos = s.find(marker)
    if pos == -1:
        # str.find returns -1 for "not found"; slicing relative to that
        # would return three characters of unrelated text.
        return ''
    start = pos + len(marker)
    return s[start:start + 3]
def get_test_center(s):
    """Return the text between ``"Venue of Test :"`` and ``"Photograph"``.

    Args:
        s: Text extracted from a PDF page.

    Returns:
        The text starting right after the ``"Venue of Test :"`` marker and
        ending just before the next ``"Photograph"``.  If no
        ``"Photograph"`` follows the marker, everything up to the end of
        the text is returned.  ``''`` is returned when the venue marker is
        absent.  Surrounding whitespace is left untouched.
    """
    marker = "Venue of Test :"
    pos = s.find(marker)
    if pos == -1:
        return ''
    # Start after the whole marker.  A hard-coded offset of 14 is one short
    # of the marker's 15 characters and leaves the ':' in the result.
    start = pos + len(marker)
    # Look for the terminator only after the venue label so an earlier
    # "Photograph" cannot produce an empty result.
    end = s.find("Photograph", start)
    if end == -1:
        # A -1 slice end would silently drop the last character.
        return s[start:]
    return s[start:end]
import PyPDF2 | |
# Pull the fields of interest out of the first page of every PDF, echo them
# to stdout, and collect one row per file in ``data``.
for pdf_path in files:
    # The context manager closes the file even if PyPDF2 raises while
    # parsing.  Text is extracted inside the block because the reader
    # works on the open file object.
    # NOTE(review): PdfFileReader / getPage / extractText are legacy PyPDF2
    # names; newer releases rename them -- confirm against the installed
    # version before upgrading.
    with open(str(pdf_path), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        s = pdfReader.getPage(0).extractText()

    # Call each extractor once and reuse the value for printing and storage.
    app_num = get_application_number(s)
    roll_num = get_roll_number(s)
    gender = get_gender(s)
    category = get_category(s)
    test_time = get_timing_of_test(s)
    test_center = get_test_center(s)

    # Console echo, in the same order as before: application number, roll
    # number, gender, category, timing, centre.
    for value in (app_num, roll_num, gender, category, test_time, test_center):
        print(value)

    # Row order must match the column list passed to DataFrame below:
    # category comes BEFORE gender there, so it does here too.
    data.append([app_num, roll_num, category, gender, test_time, test_center])

    # Two blank lines separate consecutive records on the console.
    print()
    print()

# Header order is kept as it was; only the row values were re-ordered so
# that each value lands under its own heading.
df = pd.DataFrame(
    data,
    columns=['ApplicationNum', 'RollNum', 'Category', 'Gender', 'TestTime', 'TestCenter'],
)
df.to_csv('pdf_data.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Simply run
python m_pdf_reader.py
after setting `path` (line 4) to the folder where the PDF files are kept. This will produce a CSV file, `pdf_data.csv`, containing the extracted data.