Skip to content

Instantly share code, notes, and snippets.

@shivankgtm
Created December 29, 2019 17:02
Show Gist options
  • Save shivankgtm/0addb78ec7132ff7d18bc5198dbc292b to your computer and use it in GitHub Desktop.
Save shivankgtm/0addb78ec7132ff7d18bc5198dbc292b to your computer and use it in GitHub Desktop.
import os
import pandas as pd
path = '/home/shivank98/Desktop/Motion'  # Set the path for folder where all pdf are kept.


def find_pdf_files(root):
    """Return the paths of all PDF files found under *root*, recursively.

    A file counts as a PDF only when its name ends with ``.pdf`` (compared
    case-insensitively). A plain substring test would also pick up names
    such as ``report.pdf.bak`` and would miss ``REPORT.PDF``.

    Args:
        root: Directory to walk. A non-existent directory yields an empty
            list because ``os.walk`` ignores listing errors by default.

    Returns:
        A list of joined directory/file paths, in ``os.walk`` order.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.lower().endswith('.pdf'):
                found.append(os.path.join(dirpath, filename))
    return found


# Paths of every PDF to process, and the accumulator for the extracted rows.
files = find_pdf_files(path)
data = []
def get_roll_number(s):
    """Return the 10 characters that follow the first ':' in *s*.

    NOTE(review): this relies on the first colon in the text being the one
    after the roll-number label -- confirm against the actual PDF layout.

    Args:
        s: Text to search.

    Returns:
        Up to 10 characters after the first colon, or ``""`` when *s*
        contains no colon (instead of an unrelated slice of the text).
    """
    v = s.find(':')
    if v == -1:
        # str.find signals "not found" with -1; slicing from it would
        # silently return the first 10 characters of the text.
        return ""
    start = v + 1
    return s[start:start + 10]
def get_gender(s):
    """Return the single character that follows ``"Gender :"`` in *s*.

    Args:
        s: Text to search.

    Returns:
        The one character immediately after the marker, or ``""`` when the
        marker is absent or the text ends right after it.
    """
    marker = "Gender :"
    v = s.find(marker)
    if v == -1:
        # Without this guard the -1 from str.find would index an
        # unrelated character (or raise IndexError on short text).
        return ""
    start = v + len(marker)
    # A one-character slice cannot raise IndexError at the end of the text.
    return s[start:start + 1]
def get_category(s):
    """Return the two characters that follow ``"Category :"`` in *s*.

    Args:
        s: Text to search.

    Returns:
        Up to two characters after the marker, or ``""`` when the marker
        is absent.
    """
    marker = "Category :"
    v = s.find(marker)
    if v == -1:
        # str.find returns -1 when missing; slicing from it would return
        # unrelated text from the start of the string.
        return ""
    start = v + len(marker)
    return s[start:start + 2]
def get_application_number(s):
    """Return the 12 characters found after ``"Application Number:"`` in *s*.

    One character directly after the marker is skipped before the
    12-character window starts (presumably a separator such as a space --
    confirm against the actual PDF text).

    Args:
        s: Text to search.

    Returns:
        Up to 12 characters, or ``""`` when the marker is absent.
    """
    marker = "Application Number:"
    v = s.find(marker)
    if v == -1:
        # str.find returns -1 when missing; slicing from it would return
        # an unrelated part of the text.
        return ""
    start = v + len(marker) + 1  # +1 skips the character after the colon
    return s[start:start + 12]
def get_timing_of_test(s):
    """Return the three characters that follow ``"Shift"`` in *s*.

    Args:
        s: Text to search.

    Returns:
        Up to three characters after the first occurrence of the marker
        (whitespace included, nothing is stripped), or ``""`` when the
        marker is absent.
    """
    marker = "Shift"
    v = s.find(marker)
    if v == -1:
        # str.find returns -1 when missing; slicing from it would return
        # unrelated text.
        return ""
    start = v + len(marker)
    return s[start:start + 3]
def get_test_center(s):
    """Return the text between ``"Venue of Test :"`` and ``"Photograph"``.

    The slice starts right after the full marker, so the result no longer
    begins with the marker's own ``:``.

    Args:
        s: Text to search.

    Returns:
        The text between the two markers, unstripped. ``""`` when the
        venue marker is absent. When ``"Photograph"`` does not occur after
        the venue marker, everything up to the end of *s* is returned.
    """
    marker = "Venue of Test :"
    v = s.find(marker)
    if v == -1:
        return ""
    start = v + len(marker)
    # Look for the terminator only after the venue marker so an earlier
    # "Photograph" cannot produce an empty or inverted slice.
    d = s.find("Photograph", start)
    if d == -1:
        # A -1 end index would silently drop the last character.
        return s[start:]
    return s[start:d]
import PyPDF2
# Read the first page of every PDF, pull out the six fields, echo them, and
# collect one row per file.
# NOTE(review): PdfFileReader / getPage / extractText are the legacy PyPDF2
# names; newer PyPDF2 releases renamed them -- confirm the installed version.
for pdf_path in files:
    # The context manager closes the handle even if PyPDF2 raises while
    # parsing; the text is extracted while the file is still open.
    with open(str(pdf_path), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        s = pageObj.extractText()

    # Run each extractor once and reuse the value for printing and the row.
    app_num = get_application_number(s)
    print(app_num)
    roll_num = get_roll_number(s)
    print(roll_num)
    gender = get_gender(s)
    print(gender)
    category = get_category(s)
    print(category)
    test_time = get_timing_of_test(s)
    print(test_time)
    test_center = get_test_center(s)
    print(test_center)

    # Row order must match the column labels below: Category comes before
    # Gender. Building the row as gender, category put each value under the
    # other's header.
    tup = [app_num, roll_num, category, gender, test_time, test_center]
    data.append(tup)
    print()
    print()

df = pd.DataFrame(data, columns=['ApplicationNum', 'RollNum', 'Category', 'Gender', 'TestTime', 'TestCenter'])
# Written relative to the current working directory; the DataFrame index is
# included as the first CSV column (to_csv default).
df.to_csv('pdf_data.csv')
@shivankgtm
Copy link
Author

Simply run `python m_pdf_reader.py` after setting the path on line 4 to the folder where the PDF files are kept.
This produces a CSV file (`pdf_data.csv`) containing the extracted data.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment