Skip to content

Instantly share code, notes, and snippets.

@rmehta
Created March 16, 2014 09:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rmehta/9580863 to your computer and use it in GitHub Desktop.
Save rmehta/9580863 to your computer and use it in GitHub Desktop.
Extract Voter information from Maharashtra Electoral Rolls (PDF)
# PDF to CSV converter for Maharashtra electoral rolls
# Usage:
# 1. Place all PDF folders in "source"
# 2. Set "target" to the folder where you want the output files exported
# Note: the inner directory structure will be maintained.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import os, re, csv
def convert_pdf(path):
    """Render the PDF at ``path`` to HTML and return the markup as a string.

    The PDF is run through ``process_pdf`` with an ``HTMLConverter`` that
    writes UTF-8 output into an in-memory buffer.

    Args:
        path: filesystem path of the PDF to convert.

    Returns:
        The full contents of the converter's output buffer.

    Any error raised while opening or processing the file propagates to the
    caller; the file handle, the converter and the buffer are released either
    way.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            # open() instead of the Python-2-only file(); the with-block
            # closes the handle even when process_pdf raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            # Close the converter before reading the buffer, in the same
            # order as the original code (close, then getvalue).
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
def get_strings(html):
    """Return the text runs found between a '>' and the following '<'.

    Args:
        html: markup to scan.

    Returns:
        A list of the non-empty runs of characters other than '<' that are
        immediately preceded by '>' and immediately followed by '<', in
        document order. Surrounding whitespace is kept as-is.
    """
    between_tags = re.compile('(?<=>)([^<]+?)(?=<)')
    return [hit.group(1) for hit in between_tags.finditer(html)]
def get_voters(strings):
    """Assemble voter records from the text fragments of a converted roll.

    The fragments are consumed in order by a small state machine: ``stype``
    records what the current fragment was recognised as and ``prev`` what the
    previous fragment was, so a value is interpreted according to what came
    just before it.

    NOTE(review): the nesting below was reconstructed from the control flow
    of a listing whose indentation had been lost -- confirm against the
    upstream file.

    Args:
        strings: iterable of text fragments, e.g. the result of
            ``get_strings``.

    Returns:
        A list of dicts, one per voter for whom an age was found. A record
        may carry the keys ``addr``, ``id``, ``name``, ``father_or_husband``,
        ``age``, ``sex``, ``house_no`` and ``sr``; all values are strings
        (``addr`` is ``None`` when no address line was seen beforehand).
    """
    voters = []
    v = prev = stype = addr = None
    nextsr = 1  # serial number the next voter is expected to carry
    for s in strings:
        s = s.strip()
        ns = s.replace(" ", "").lower()  # normalised form used for matching
        if not ns:
            continue

        # A long fragment mentioning "pincode" is remembered as the address
        # for every record started after it.
        if "pincode" in ns and len(ns) > 20:
            addr = s

        prev = stype
        stype = None

        # Presumably the prefixes of voter ID numbers -- TODO confirm.
        if ns.startswith(("mt/", "nct")):
            if v:
                v["id"] = s
                stype = "id"
            continue

        # A photo marker or a "Part No" header starts a new record; the one
        # in progress is kept only if an age was parsed for it.
        if ("photoavailable" in ns) or ("photonotavailable" in ns) \
                or ns.startswith("partno"):
            if v and "age" in v:
                voters.append(v)
            v = {"addr": addr}
            stype = "new"

        if prev == "new":
            v["father_or_husband"] = s
            # A lowercase letter directly followed by an uppercase one is
            # treated as two names run together: split between the letters.
            joined = re.search("[a-z][A-Z]", s)
            if joined:
                i = joined.start()
                v["father_or_husband"] = s[:i + 1]
                v["name"] = s[i + 1:]

        if s.startswith("Age :"):
            if not v:
                v = {}
            s = s.replace("Sex", "")
            v["age"] = s.split(":")[1].strip()
            stype = "age"

        if prev == "age":
            if "Male" in s:
                v["sex"] = "Male"
            if "Female" in s:
                v["sex"] = "Female"
            stype = "sex"

        if prev == "sex":
            # Text before the first colon is taken as the name, text after
            # the last colon as the house number.
            p = s.split(":")
            if p[0].strip():
                v["name"] = p[0].strip()
            if p[-1].strip():
                v["house_no"] = p[-1].strip()
            stype = "house"

        if prev == "house":
            if s == str(nextsr):
                v["sr"] = s
                nextsr = int(v["sr"]) + 1
                stype = "sr"
            else:
                # NOTE(review): this tests the raw fragment, not ``ns``, so a
                # label spelled "House No" does not match -- confirm intent.
                if "houseno" not in s:
                    v["house_no"] = s
                # Stay in the "house" state until the serial number shows up.
                stype = "house"

        if prev == "id":
            if not ns.startswith("elec"):
                v["house_no"] = s

    # Records are otherwise appended only when the next marker arrives, so
    # without this flush the last voter of the input would be dropped.
    if v and "age" in v:
        voters.append(v)
    return voters
def write_csv(voters, filename):
    """Write voter records to ``filename`` as CSV, one row per voter.

    The first row is a header derived from the column keys (underscores
    become spaces, words are title-cased). A key missing from a record is
    written as an empty field.

    Args:
        voters: iterable of dicts such as those built by ``get_voters``.
        filename: path of the file to create or overwrite.
    """
    columns = ("sr", "id", "name", "age", "sex", "father_or_husband", "house_no", "addr")
    header = [column.replace("_", " ").title() for column in columns]
    with open(filename, "w") as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            [record.get(column, "") for column in columns] for record in voters
        )
# Convert every PDF found under ``source`` into a CSV under ``target``,
# mirroring the directory layout of ``source``.
source = "source"
target = "out"
for basepath, folders, files in os.walk(source):
    # Directory under ``target`` that mirrors the one being visited.
    outpath = os.path.join(target, os.path.relpath(basepath, source))
    for f in files:
        if not f.endswith(".pdf"):
            continue
        # splitext drops only the final extension; rsplit(".")[0] cut the
        # name at its first dot, so "roll.v2.pdf" became "roll.csv".
        outfile = os.path.join(outpath, os.path.splitext(f)[0] + ".csv")
        if not os.path.exists(outpath):
            os.makedirs(outpath)
        html = convert_pdf(os.path.join(basepath, f))
        print(outfile)
        write_csv(get_voters(get_strings(html)), outfile)
@suyashdb
Copy link

https://gist.github.com/rmehta/9580863#file-convert-py-L7
Is process_pdf a function that was added to the pdfminer module? If so, could you please share it?

Also, does this script handle converting Marathi electoral rolls to txt/csv, etc.?

Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment