Skip to content

Instantly share code, notes, and snippets.

@efaisal
Created January 23, 2014 07:35
Show Gist options
  • Save efaisal/8574477 to your computer and use it in GitHub Desktop.
Save efaisal/8574477 to your computer and use it in GitHub Desktop.
Split the PDF file issued by SPR every quarter according to Daerah Mengundi, so that people can check voters' movement.
#!/usr/bin/env python
"""
Split the PDF issued by SPR every quarter according to Daerah Mengundi
Requires PyPDF2 https://github.com/mstamy2/PyPDF2
"""
import sys
import os
import re
from cStringIO import StringIO
from PyPDF2 import PdfFileReader, PdfFileWriter
# Matches page text containing "102/26/<digits> <name>BAHAGIAN" and captures
# <name> (word characters and whitespace) as the named group "dm".
# Written as a raw string so the backslashes reach the regex engine untouched;
# the compiled pattern is identical to the previous non-raw literal.
# NOTE(review): "102/26" is hard-coded -- presumably the constituency codes of
# one specific SPR document; confirm before reusing on other files.
regex = re.compile(r".+102\/26\/\d+\ (?P<dm>[\w\s]+)BAHAGIAN.+", re.U)
def _close_section(result, writer, dm, last_page):
    """Finish the currently open section, if there is one.

    Records *last_page* as the 'end' of the most recent entry in *result*
    when that entry is still open, and writes the pages collected in
    *writer* to '<dm>.pdf' (relative to the current working directory).
    """
    # Compare against None explicitly: 0 is a valid end index (a section
    # that ends on the first page) and must not be treated as "still open".
    if result and result[-1]['end'] is None:
        result[-1]['end'] = last_page
    if writer:
        with open(dm + '.pdf', 'wb') as fh:
            writer.write(fh)


def process_pdf(pdf):
    """Split *pdf* into one PDF file per Daerah Mengundi (DM) section.

    A section starts on a page whose extracted text contains the document
    title and matches the module-level ``regex``; the captured ``dm`` group
    names the section. Following pages belong to the section as long as
    their text contains that same ``dm`` string. Pages with no extractable
    text are skipped without closing the section.

    Each section is written to '<dm>.pdf' in the current working directory.

    :param pdf: file-like object passed straight to ``PdfFileReader``.
    :returns: list of dicts ``{'DM': name, 'start': first_page, 'end':
        last_page}`` with 0-based page indexes, in document order.
    """
    # NOTE(review): dm is used verbatim as the file name; the regex group
    # also captures whitespace, so names may carry trailing blanks.
    title = "SURUHANJAYA PILIHAN RAYA MALAYSIARANG DAFTAR PEMILIH"
    result = []
    reader = PdfFileReader(pdf)
    no_of_pages = reader.getNumPages()
    writer, dm = None, None

    # range() rather than xrange(): same iteration here, and the name also
    # exists on Python 3.
    for i in range(no_of_pages):
        page = reader.getPage(i)
        text = page.extractText()
        if not text:
            continue

        is_title_page = title in text
        m = regex.search(text) if is_title_page else None

        if m:
            # A new section begins here, so the previous one ended on the
            # page before.
            _close_section(result, writer, dm, i - 1)
            writer = PdfFileWriter()
            dm = m.group('dm')
            result.append({'DM': dm, 'start': i, 'end': None})
            writer.addPage(page)
        elif not is_title_page and dm and dm in text:
            # Continuation page of the section that is currently open.
            writer.addPage(page)
        else:
            # Either a title page the regex could not parse or a page that
            # does not mention the open section: close whatever is open.
            _close_section(result, writer, dm, i - 1)
            writer, dm = None, None

    # A section still open here runs through the final page of the document,
    # so its end is the last page index (not the one before it).
    _close_section(result, writer, dm, no_of_pages - 1)
    return result
def print_help(msg=None):
    """Print an optional message followed by the usage text and a blank line.

    :param msg: text printed before the usage lines; skipped when falsy.
    """
    if msg:
        print(msg)
    print('Usage:')
    print('\tpdfparser.py file')
    # A bare `print` only emits the blank line as a Python 2 statement; on
    # Python 3 it is a no-op expression. print('') yields the same blank
    # line under both.
    print('')
def get_pdf_file(filename):
    """Read the file at *filename* into memory and return it as a StringIO.

    The file is opened in binary mode and is closed again before this
    function returns; the returned buffer holds its complete contents.
    """
    with open(filename, 'rb') as source:
        contents = source.read()
    return StringIO(contents)
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the path of
    # the PDF to split. Exits with status 1 on a usage error.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print_help('file argument required')
        sys.exit(1)

    filename = cli_args[0]
    if not os.path.exists(filename):
        print_help('file not found')
        sys.exit(1)

    # Load the whole PDF into memory, split it, then list what was found.
    sections = process_pdf(get_pdf_file(filename))
    print("Total Daerah Mengundi: %d" % len(sections))
    for number, section in enumerate(sections, 1):
        print("%d. %s" % (number, section['DM']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment