# Created September 13, 2012 20:38
# Python CSV processing with Twitter API lookups
import csv
import urllib2
import json
import re
def findID(str):
    """Return the numeric status ID embedded in a status link.

    The input is expected to contain ``statuses/<digits>`` preceded by at
    least one other character (for example a link whose path ends in
    ``.../statuses/123``).  Matching is case-insensitive and anchored at
    the start of the string.

    Note: the parameter name shadows the builtin ``str``; it is kept so
    existing callers are unaffected.

    Returns:
        The run of digits following ``statuses/`` as a string, or ``None``
        when the input does not contain such a segment.
    """
    matchObj = re.match(r'(.)+statuses\/(\d+)', str, re.I)
    if matchObj is None:
        return None
    # Group 1 is only the last character before "statuses/"; the ID is
    # captured by group 2.
    return matchObj.group(2)
def findURL(str):
    """Return the link text of an HTML anchor, or the input unchanged.

    The input is normally a link of the form
    ``<a href="X" rel="X">link text</a>``; in that case only the link text
    is returned.  Sometimes the Twitter source is a plain string such as
    ``web``; any input that is not an anchor is returned as-is.

    Note: the parameter name shadows the builtin ``str``; it is kept so
    existing callers are unaffected.
    """
    matchObj = re.match(r'.+>(.+)<\/a>', str, re.I)
    if matchObj:
        return matchObj.group(1)
    # Not an anchor: hand the original value back untouched.
    return str
# Copy in.csv to out.csv.  The header line is written through unchanged.
# For every other row, column 3 is filled in:
#   * rows whose column 2 is "TWITTER" get the cleaned-up "source" field of
#     the status referenced by the link in column 1,
#   * 'not authorized' when the lookup fails with an HTTP error,
#   * 'n/a' for all other rows, and for rows with no status ID in the link.
# NOTE(review): assumes every data row has at least four columns -- confirm
# against the real in.csv.
with open('in.csv', 'rU') as infile, open('out.csv', 'wb') as outfile:
    reader = csv.reader(infile, delimiter=',', quotechar='"')
    writer = csv.writer(outfile, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    rowcount = 0
    for row in reader:
        # The first line is the header: no transformations or lookups needed.
        if rowcount > 0:
            if row[2] == "TWITTER":
                status_id = findID(row[1])
                if not status_id:
                    # No status ID could be extracted, so there is nothing
                    # to look up.
                    row[3] = 'n/a'
                else:
                    # NOTE(review): the endpoint prefix is an empty string
                    # in this file, so the URL is just the bare ID and
                    # cannot be fetched -- restore the intended API URL.
                    url = '' + status_id
                    req = urllib2.Request(
                        url, headers={'User-Agent': "Magic Browser"})
                    try:
                        # Pass the Request object (not the bare url) so the
                        # User-Agent header is actually sent.
                        data = urllib2.urlopen(req)
                        try:
                            source = data.read()
                        finally:
                            data.close()
                    except urllib2.HTTPError:
                        # Response codes are sometimes 403 (forbidden) or
                        # 404 (not found).
                        row[3] = 'not authorized'
                    else:
                        obj = json.loads(source)
                        # Replace U+00A0 and U+00AE with plain spaces.
                        cleanstr = findURL(obj["source"]).replace(u'\xa0', u' ')
                        cleanstr = cleanstr.replace(u'\xae', u' ')
                        row[3] = cleanstr
            else:
                row[3] = 'n/a'
        writer.writerow(row)
        rowcount += 1
