Skip to content

Instantly share code, notes, and snippets.

@soodoku
Last active September 14, 2016 13:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soodoku/62a3172eb1b4a55dee1a to your computer and use it in GitHub Desktop.
Save soodoku/62a3172eb1b4a55dee1a to your computer and use it in GitHub Desktop.
Get text from Wisconsin ad pdfs using pyPdf
'''
Text from Searchable pdfs
Scrape Text off Wisconsin Ads pdfs
Uses pyPdf to get text from searchable pdfs. The script is tailored for getting data
from Wisconsin Political Ads Database: http://wiscadproject.wisc.edu/Storyboards.
@author: Gaurav Sood
Created on November 02, 2011
'''
import sys, os, re, pyPdf, codecs, csv, string
def convertPdf2String(pdfFile):
    """Return the text of every page of a PDF as one whitespace-collapsed string.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with non-breaking
        spaces (U+00A0) turned into plain spaces and every run of whitespace
        collapsed to a single space. If anything goes wrong while opening or
        reading the file, no exception is raised; the string
        "Unable to open file: <pdfFile> with error: <message>" is returned
        instead.
    """
    # Failures are reported through the return value rather than raised; the
    # original author notes EOF errors from the PDF reader as a nuisance.
    try:
        # The context manager closes the file handle even when parsing fails.
        # All page access stays inside the `with` so the reader never touches
        # a closed stream.
        with open(pdfFile, "rb") as pdf_handle:
            pdf = pyPdf.PdfFileReader(pdf_handle)
            # Each page's text is followed by " \n" so that words on
            # adjacent pages do not run together.
            pages = [
                pdf.getPage(page_number).extractText() + " \n"
                for page_number in range(pdf.getNumPages())
            ]
        content = u"".join(pages)
        # Collapse whitespace: split() with no argument drops every run of
        # whitespace, and the join puts single spaces back.
        content = u" ".join(content.replace(u"\xa0", u" ").strip().split())
    except Exception as e:
        return "Unable to open file: %s with error: %s" % (pdfFile, str(e))
    return content
# Boilerplate removed from the ad text, applied in this order. The strings are
# matched literally (the "." characters are not wildcards).
_AD_TEXT_BOILERPLATE = (
    "Copyright 2003 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
    "Copyright 2004 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
    "Storyboard",
)


def _parse_ad_text(row):
    """Split the extracted text of one ad PDF into the fields of a CSV record.

    The fields are cut out of `row` by looking for the literal markers
    'Brand:', 'Parent', 'Parent:', 'Aired:', 'Creative Id:', '[' and '[PFB'.
    A marker that is missing simply yields an empty or untrimmed field; no
    exception is raised.

    Args:
        row: The whitespace-collapsed text of one PDF.

    Returns:
        The tuple (creative, creative_id, date, race, title, brand, parent,
        sponsor, text).
    """
    creative, _, after_brand = row.partition('Brand:')
    # The first space-separated token is the race; the title is whatever
    # follows it up to 'Brand:'.
    race, _, after_race = row.partition(' ')
    title = after_race.partition('Brand:')[0]
    brand = after_brand.partition('Parent')[0]
    parent = row.partition('Parent:')[2].partition('Aired:')[0]
    date = row.partition('Aired:')[2].partition('Creative Id:')[0].strip()
    creative_id, _, body = row.partition('Creative Id:')[2].partition('[')

    # Ad text: from the first '[' after the creative id up to '[PFB'. The
    # leading '[' consumed by partition() is put back.
    text = '[' + body.partition('[PFB')[0]

    # Sponsor: what follows '[PFB'. When that remainder contains a colon the
    # piece after the first colon is used, otherwise the whole remainder.
    sponsor_parts = row.partition('[PFB')[2].split(':')
    if len(sponsor_parts) == 1:
        sponsor = sponsor_parts[0].rstrip()
    else:
        sponsor = sponsor_parts[1].rstrip()
    # partition() leaves the string intact when the separator is absent, so
    # these trims need no guard. The previous `if s.find(x):` guards were
    # truthy for "not found" (-1) and falsy for a match at index 0, which
    # skipped the trim exactly when the marker came first.
    sponsor = sponsor.partition('Copyright')[0].lstrip()
    sponsor = sponsor.partition(']')[0]

    for boilerplate in _AD_TEXT_BOILERPLATE:
        text = text.replace(boilerplate, "")

    return (creative, creative_id, date, race, title, brand, parent, sponsor, text)


def writer(path, out):
    """Extract one CSV record from every file in `path` and write it to `out`.

    Each file's text is obtained with convertPdf2String(), reduced to ASCII
    (other characters become XML character references), printed, and, if it
    passes the 'Brand' check below, parsed and written as one row.

    Args:
        path: Directory holding the PDF files. A trailing separator is
            accepted but no longer required.
        out: Object with a writerow(sequence) method, e.g. a csv.writer.

    Returns:
        None.
    """
    for fname in os.listdir(path):
        extracted = convertPdf2String(os.path.join(path, fname))
        # Encode to ASCII with character references, then decode so the
        # parsing below works on text under both Python 2 and Python 3.
        row = extracted.encode("ascii", "xmlcharrefreplace").decode("ascii")
        print(row)
        # Skip files where 'Brand' is absent or is the very first thing in
        # the text. NOTE(review): presumably this also filters out the error
        # strings convertPdf2String returns - confirm against real data.
        if row.find('Brand') <= 0:
            continue
        out.writerow(_parse_ad_text(row))
# Placeholders: set ADS_FOLDER to the directory holding the ad PDFs and
# OUT_PATH to the CSV file to create before running the script.
ADS_FOLDER = 'path/to/ads/folder'
OUT_PATH = 'outpath'

# Column names, in the same order as the records produced by writer().
header = ('creative', 'creative_id', 'date.aired', 'race', 'title', 'brand', 'parent', 'sponsor', 'text')

if __name__ == '__main__':
    # Binary mode is what the Python 2 csv module expects. The context
    # manager makes sure the file is flushed and closed once all rows are
    # written, even if writer() raises part-way through.
    with open(OUT_PATH, 'wb') as out_file:
        ads = csv.writer(out_file)
        ads.writerow(header)  # Header Row
        writer(ADS_FOLDER, ads)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment