Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code to parse a voter list PDF that was OCRed by Tesseract
import sys
# Field labels as they appear in the tesseract output of the Tamil voter list.
# The literals are copied verbatim from the original script; some of them
# appear to end in an invisible zero-width character, so do not retype them.
VOTER_NAME_LABEL = "வாக்காளர்‌ பெயர்‌"  # "voter name"; text is split into records here
NAME_LABEL = "பெயர்‌"  # "name"
HUSBAND_LABEL = "கணவர்"  # "husband"
FATHER_LABEL = "தந்தை"  # "father"
AGE_LABEL = "வயது"  # "age"
SEX_LABEL = "இனம்‌"  # "sex"

# Extracted rows are appended to this file, relative to the working directory.
OUTPUT_PATH = "result.csv"

# Sample OCR line carrying both age and sex ("Age: 63 Sex: Female"):
#   வயது: 63 இனம்‌: பெண்‌


def _second_piece(text, separator):
    """Return ``text.split(separator)[1]``, or None when that piece is missing."""
    pieces = text.split(separator)
    return pieces[1] if len(pieces) > 1 else None


def _fields_from_line(line):
    """Return the CSV fields found on one OCR line, in output order.

    A field whose expected delimiter is missing is skipped instead of raising
    IndexError, so one badly OCRed line does not abort the whole run.
    """
    fields = []

    # After splitting on the voter-name label, the name is left on a line
    # that starts with the label's trailing colon.
    if line.strip().startswith(":"):
        print("line = " + line)
        voter_name = _second_piece(line, ":")
        print(voter_name)
        fields.append(voter_name)

    if HUSBAND_LABEL in line:
        print("line = " + line)
        if ":" in line:
            husband_name = _second_piece(_second_piece(line, HUSBAND_LABEL), ":")
            # None means the colon sits before the label, not after it.
            if husband_name is not None:
                print(husband_name)
                fields.append(husband_name)

    if FATHER_LABEL in line:
        print("line = " + line)
        if ":" in line:
            father_name = _second_piece(_second_piece(line, FATHER_LABEL), ":")
            if father_name is not None:
                print(father_name)
        else:
            # OCR dropped the colon: fall back to the text after the name label.
            father_name = _second_piece(line, NAME_LABEL)
        if father_name is not None:
            fields.append(father_name)

    if AGE_LABEL in line:
        print("line = " + line)
        # Age is only taken from lines that also carry the sex label; the
        # value is whatever sits between the age colon and the sex label.
        if SEX_LABEL in line:
            before_sex = line.split(AGE_LABEL)[1].split(SEX_LABEL)[0]
            age = _second_piece(before_sex, ":")
            if age is not None:
                print(age)
                fields.append(age)

    if SEX_LABEL in line:
        print("line = " + line)
        sex = _second_piece(_second_piece(line, SEX_LABEL), ":")
        if sex is not None:
            print(sex)
            fields.append(sex)

    return fields


def parse_voter_list(content: str) -> str:
    """Convert OCR text of a voter list into CSV text.

    The text is split on the voter-name label and each resulting piece is
    treated as one record. Every extracted field is written followed by a
    comma, and every record ends with a newline; values are not stripped or
    quoted. The text before the first label also counts as a piece, so it
    contributes a row of its own (normally just an empty line).

    NOTE(review): the script this was rebuilt from had lost its indentation;
    "one row per record" is the reconstructed intent -- please confirm.

    Args:
        content: Full OCR text of the voter list.

    Returns:
        The CSV text, one line per record.
    """
    rows = []
    for record in content.split(VOTER_NAME_LABEL):
        fields = []
        for line in record.split("\n"):
            fields.extend(_fields_from_line(line))
        rows.append("".join(field + "," for field in fields) + "\n")
    return "".join(rows)


def main(argv=None):
    """Read the OCR text file named in ``argv[1]`` and append rows to result.csv.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("usage: " + args[0] + " <ocr_text_file>")

    # Explicit UTF-8: the Tamil text must not depend on the platform default.
    with open(args[1], encoding="utf-8") as source:
        content = source.read()

    csv_string = parse_voter_list(content)
    print(csv_string)

    # Append mode so repeated runs (e.g. one per page) accumulate in one file.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as out:
        out.write(csv_string)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.