Created
January 23, 2014 07:35
-
-
Save efaisal/8574477 to your computer and use it in GitHub Desktop.
Split the PDF file issued quarterly by SPR into one file per Daerah Mengundi, so that people can check voters' movement.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
Split the PDF issued quarterly by SPR into one PDF file per Daerah Mengundi.

Requires PyPDF2: https://github.com/mstamy2/PyPDF2
"""
import sys | |
import os | |
import re | |
from cStringIO import StringIO | |
from PyPDF2 import PdfFileReader, PdfFileWriter | |
# Captures the Daerah Mengundi name (group "dm") that sits between the
# "102/26/<digits> " locality code and the word "BAHAGIAN" in a page's text.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences.
regex = re.compile(r".+102\/26\/\d+\ (?P<dm>[\w\s]+)BAHAGIAN.+", re.U)
def _close_section(result, writer, dm, last_page):
    """Record where the section in progress ends and write its PDF to disk.

    Does nothing for the parts that do not apply: the end index is only set
    while it is still ``None``, and a file is only written when a writer
    exists.
    """
    # Compare against None explicitly: 0 is a valid page index and must not
    # be mistaken for "end not recorded yet".
    if result and result[-1]['end'] is None:
        result[-1]['end'] = last_page
    if writer:
        with open(dm + '.pdf', 'wb') as fh:
            writer.write(fh)


def process_pdf(pdf):
    """Split *pdf* into one PDF file per Daerah Mengundi (DM).

    Pages are scanned in order. A page whose text contains the report header
    and matches the module-level ``regex`` starts a new section; following
    pages whose text contains the current DM name are appended to it. Any
    other page with text closes the section. Pages with no extractable text
    are skipped. Each finished section is written to ``<dm>.pdf`` in the
    current working directory, using the captured DM name verbatim.

    :param pdf: binary file-like object accepted by ``PdfFileReader``.
    :returns: list of dicts, one per section, with keys ``'DM'`` (captured
        name), ``'start'`` and ``'end'`` (0-based page indices, inclusive).
    """
    # Header text as it appears in the extracted page text.
    header = "SURUHANJAYA PILIHAN RAYA MALAYSIARANG DAFTAR PEMILIH"

    result = []
    reader = PdfFileReader(pdf)
    no_of_pages = reader.getNumPages()
    writer, dm = None, None

    for i in range(no_of_pages):
        page = reader.getPage(i)
        text = page.extractText()
        if not text:
            continue

        has_header = header in text
        m = regex.search(text) if has_header else None

        if m:
            # A new DM begins on this page, so the previous one (if any)
            # ended on the page before it.
            _close_section(result, writer, dm, i - 1)
            writer = PdfFileWriter()
            dm = m.group('dm')
            result.append({'DM': dm, 'start': i, 'end': None})
            writer.addPage(page)
        elif not has_header and dm and dm in text:
            # Continuation page of the DM currently being collected.
            writer.addPage(page)
        else:
            # Either a header page without a DM match or an unrelated page:
            # the section in progress (if any) ended on the previous page.
            _close_section(result, writer, dm, i - 1)
            writer, dm = None, None

    # A section still open here runs through the final page of the document.
    _close_section(result, writer, dm, no_of_pages - 1)
    return result
def print_help(msg=None):
    """Print the optional message *msg*, then the command-line usage text."""
    usage_lines = ('Usage:', '\tpdfparser.py file')
    if msg:
        print(msg)
    for usage_line in usage_lines:
        print(usage_line)
def get_pdf_file(filename):
    """Read *filename* fully into memory and return it as a binary stream.

    The source file is closed before returning, so the returned in-memory
    stream is independent of the file on disk.

    :param filename: path of the PDF file to load.
    :returns: an ``io.BytesIO`` positioned at offset 0 holding the file bytes.
    :raises IOError/OSError: if the file cannot be opened or read.
    """
    # io.BytesIO is the byte-oriented in-memory buffer available on both
    # Python 2.6+ and Python 3; cStringIO exists only on Python 2.
    from io import BytesIO

    with open(filename, 'rb') as fh:
        return BytesIO(fh.read())
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the path of the PDF.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print_help('file argument required')
        sys.exit(1)

    filename = cli_args[0]
    if not os.path.exists(filename):
        print_help('file not found')
        sys.exit(1)

    # Load the PDF into memory, split it, then list the sections found.
    pdf = get_pdf_file(filename)
    result = process_pdf(pdf)
    print("Total Daerah Mengundi: %d" % len(result))
    for position, entry in enumerate(result, 1):
        print("%d. %s" % (position, entry['DM']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment