Last active
September 14, 2016 13:21
-
-
Save soodoku/62a3172eb1b4a55dee1a to your computer and use it in GitHub Desktop.
Get text from Wisconsin ad pdfs using pyPdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Text from Searchable pdfs
Scrape Text off Wisconsin Ads pdfs
Uses pyPdf to get text from searchable pdfs. The script is tailored for getting data
from Wisconsin Political Ads Database: http://wiscadproject.wisc.edu/Storyboards.
@author: Gaurav Sood
Created on November 02, 2011
'''
import sys, os, re, pyPdf, codecs, csv, string | |
def convertPdf2String(pdfFile):
    """Return the text of a searchable PDF as one whitespace-collapsed string.

    Text is extracted page by page with pyPdf. Non-breaking spaces are
    replaced by ordinary spaces and every run of whitespace is collapsed to a
    single space.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The extracted text. If the file cannot be opened or parsed, a message
        of the form "Unable to open file: <path> with error: <error>" is
        returned instead of raising.
    """
    # The broad catch is deliberate: pyPdf raises a nuisance EOF exception on
    # some files, and the failure is reported through the return value.
    try:
        # `with` guarantees the handle is closed once extraction is done.
        # Pages are read inside the block because the reader pulls them from
        # the open stream.
        with open(pdfFile, "rb") as pdf_handle:
            pdf = pyPdf.PdfFileReader(pdf_handle)
            pages = [
                pdf.getPage(i).extractText() + " \n"
                for i in range(pdf.getNumPages())
            ]
        content = "".join(pages)
        # Collapse whitespace (including non-breaking spaces) to single spaces.
        content = u" ".join(content.replace(u"\xa0", u" ").strip().split())
    except Exception as e:
        return "Unable to open file: %s with error: %s" % (pdfFile, str(e))
    return content
def _parse_ad_row(row):
    """Split the text of one storyboard into the fields of a CSV record.

    The text is cut at the literal labels 'Brand:', 'Parent:', 'Aired:',
    'Creative Id:' and the '[PFB' marker.

    Args:
        row: Whitespace-collapsed text of one storyboard PDF.

    Returns:
        A tuple (creative, creative_id, date, race, title, brand, parent,
        sponsor, text), or None when 'Brand' does not occur after the first
        character of the text.
    """
    if row.find('Brand') <= 0:
        return None

    # The first space-separated token is the race; the rest, up to the brand
    # label, is the title.
    race, _, after_race = row.partition(' ')
    title = after_race.partition('Brand:')[0]
    creative = row.partition('Brand:')[0]
    brand = row.partition('Brand:')[2].partition('Parent')[0]
    parent = row.partition('Parent:')[2].partition('Aired:')[0]
    date = row.partition('Aired:')[2].partition('Creative Id:')[0].strip()

    after_id = row.partition('Creative Id:')[2]
    creative_id = after_id.partition('[')[0]
    # The ad text runs from the first '[' after the creative id up to the
    # '[PFB' marker. It is computed unconditionally so it is never left
    # unbound or stale.
    text = '[' + after_id.partition('[')[2].partition('[PFB')[0]

    # str.partition leaves the string untouched when the separator is absent,
    # so no str.find() guards are needed. The original guards tested find()
    # for truthiness, which is wrong for both -1 (absent) and 0 (at start).
    sponsor = row.partition('[PFB')[2]
    pieces = sponsor.split(':')
    sponsor = (pieces[0] if len(pieces) == 1 else pieces[1]).rstrip()
    sponsor = sponsor.partition('Copyright')[0].lstrip()
    sponsor = sponsor.partition(']')[0]

    # Clean if you want to: drop the boilerplate footer and page label.
    # Literal replacement is used so '.' only matches a real dot.
    for boilerplate in (
        "Copyright 2003 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
        "Copyright 2004 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
        "Storyboard",
    ):
        text = text.replace(boilerplate, "")

    return (creative, creative_id, date, race, title, brand, parent, sponsor, text)


def writer(path, out):
    """Write one CSV row per parseable storyboard PDF found in a folder.

    Every directory entry is passed to convertPdf2String. The resulting text
    is ASCII-encoded (non-ASCII characters become XML character references),
    echoed to stdout and, when it can be parsed, written to `out`.

    Args:
        path: Folder containing the PDF files; a trailing separator is
            optional.
        out: Object with a writerow(sequence) method, e.g. a csv.writer.
    """
    for fname in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator.
        pdf_path = os.path.join(path, fname)
        row = convertPdf2String(pdf_path).encode("ascii", "xmlcharrefreplace")
        print(row)
        record = _parse_ad_row(row)
        if record is not None:
            out.writerow(record)
# Header row of the output CSV; the column order matches the record tuples
# that writer() passes to writerow().
header = ('creative', 'creative_id', 'date.aired', 'race', 'title', 'brand', 'parent', 'sponsor', 'text')

# NOTE: 'outpath' and ads_folder are placeholders. Replace them with the real
# output CSV path and the folder holding the storyboard PDFs before running.
# Keep the trailing separator on the folder path.
ads_folder = 'path/to/ads/folder/'

# Binary mode is what the Python 2 csv module expects. The `with` block
# makes sure the file is flushed and closed once every row is written.
with open('outpath', 'wb') as out_file:
    ads = csv.writer(out_file)
    ads.writerow(header)  # Header Row
    writer(ads_folder, ads)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment