Create a gist now

Instantly share code, notes, and snippets.

Get text from Wisconsin ad pdfs using pyPdf
'''
Text from Searchable pdfs
Scrape Text off Wisconsin Ads pdfs
Uses pyPdf to get text from searchable PDFs. The script is tailored for getting data
from Wisconsin Political Ads Database: http://wiscadproject.wisc.edu/Storyboards.
@author: Gaurav Sood
Created on November 02, 2011
'''
import sys, os, re, pyPdf, codecs, csv, string
def convertPdf2String(pdfFile):
    """Return the text of a searchable PDF as one whitespace-collapsed string.

    The text of every page is extracted with pyPdf and the pages are joined.
    Non-breaking spaces (U+00A0) are turned into ordinary spaces and every
    run of whitespace, including the page separators, is collapsed to a
    single space.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The extracted text. If opening or parsing the file fails, no
        exception is raised; the string
        "Unable to open file: <pdfFile> with error: <message>" is returned
        instead, so callers must be prepared to receive that message in
        place of real content.
    """
    try:
        # The reader is handed the open stream, so page extraction is done
        # inside the ``with`` block; the handle is closed as soon as we leave
        # it, even when pyPdf raises part-way through.
        with open(pdfFile, "rb") as stream:
            pdf = pyPdf.PdfFileReader(stream)
            pages = [
                pdf.getPage(page_no).extractText()
                for page_no in range(pdf.getNumPages())
            ]
        # Collapse whitespace: split() with no argument drops every run of
        # whitespace, so joining on a single space normalises the text.
        joined = u" \n".join(pages)
        content = u" ".join(joined.replace(u"\xa0", u" ").split())
    except Exception as e:
        # Deliberately broad: the original author saw nuisance EOF errors
        # from the PDF parser and chose to report a message rather than abort
        # the whole batch.
        return "Unable to open file: %s with error: %s" % (pdfFile, str(e))
    return content
# Boilerplate removed from the ad text. The strings are applied as regular
# expressions (so "." matches any character), exactly as the script always did.
_TEXT_BOILERPLATE = (
    "Copyright 2003 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
    "Copyright 2004 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
    "Storyboard",
)


def _parse_ad_record(row):
    """Split the flattened text of one storyboard PDF into CSV fields.

    The text is cut at the literal labels 'Brand:', 'Parent:', 'Aired:',
    'Creative Id:', '[' and '[PFB'.

    Args:
        row: Whitespace-collapsed text of one PDF.

    Returns:
        The tuple (creative, creative_id, date, race, title, brand, parent,
        sponsor, text), or None when 'Brand' does not occur after the first
        character of ``row`` (such rows are skipped).
    """
    if row.find('Brand') <= 0:
        return None

    # Everything before 'Brand:' is the creative; its first space-separated
    # token is the race and the remainder is the title.
    creative = row.partition('Brand:')[0]
    race = row.split(' ')[0]
    title = ' '.join(row.split(' ')[1:]).partition('Brand:')[0]

    brand = row.partition('Brand:')[2].partition('Parent')[0]
    parent = row.partition('Parent:')[2].partition('Aired:')[0]
    date = row.partition('Aired:')[2].partition('Creative Id:')[0].strip()

    after_id = row.partition('Creative Id:')[2]
    creative_id = after_id.partition('[')[0]

    # str.partition returns the whole string as its first element when the
    # separator is missing, so these steps are safe to run unconditionally.
    # The previous ``if s.find(x):`` guards were truthy for "not found" (-1)
    # and falsy only for a match at index 0, which skipped the work precisely
    # when it was needed.
    text = '[' + after_id.partition('[')[2].partition('[PFB')[0]

    sponsor_parts = row.partition('[PFB')[2].split(':')
    if len(sponsor_parts) == 1:
        sponsor = sponsor_parts[0].rstrip()
    else:
        sponsor = sponsor_parts[1].rstrip()
    sponsor = sponsor.partition('Copyright')[0].lstrip()
    sponsor = sponsor.partition(']')[0]

    # Clean the vendor boilerplate out of the ad text.
    for pattern in _TEXT_BOILERPLATE:
        text = re.sub(pattern, "", text)

    return (creative, creative_id, date, race, title, brand, parent, sponsor, text)


def writer(path, out):
    """Write one CSV row per parseable PDF found in a folder.

    Every entry of ``path`` is passed to ``convertPdf2String``; the resulting
    text is reduced to ASCII (other characters become XML character
    references), echoed to stdout, and parsed into fields. Entries whose text
    has no usable 'Brand' marker are skipped, which includes the error
    message returned for unreadable files unless it happens to contain
    'Brand'.

    Args:
        path: Folder containing the storyboard PDFs. A trailing separator is
            no longer required.
        out: Object with a ``writerow`` method, e.g. a ``csv.writer``.
    """
    for fname in os.listdir(path):
        raw = convertPdf2String(os.path.join(path, fname))
        # Round-trip through ASCII so the row is text (not bytes) on every
        # Python version while keeping the xmlcharrefreplace escaping.
        row = raw.encode("ascii", "xmlcharrefreplace").decode("ascii")
        print(row)
        record = _parse_ad_record(row)
        if record is not None:
            out.writerow(record)
# Script entry point: build the CSV of ad records.
#
# The ads folder was left as a placeholder expression (path.to.ads.folder) and
# the output file as the literal 'outpath'. The folder is now taken from the
# first command-line argument; the output path from the optional second
# argument, defaulting to 'outpath' as before.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s ADS_FOLDER [OUTPUT_CSV]" % sys.argv[0])
    ads_folder = sys.argv[1]
    out_path = sys.argv[2] if len(sys.argv) > 2 else 'outpath'

    # Header Row
    header = ('creative', 'creative_id', 'date.aired', 'race', 'title', 'brand', 'parent', 'sponsor', 'text')

    # 'wb' is the mode the Python 2 csv module expects. The with-block makes
    # sure the file is flushed and closed even if a PDF makes writer() fail.
    with open(out_path, 'wb') as out_file:
        ads = csv.writer(out_file)
        ads.writerow(header)  # Header Row
        writer(ads_folder, ads)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment