Created
September 13, 2012 20:38
-
-
Save ntanya/3717461 to your computer and use it in GitHub Desktop.
Python csv with twitter api lookups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import urllib2 | |
import json | |
import re | |
def findID(str): | |
# extracts the status ID from teh following string example: | |
# "http://twitter.com/IBPTravels/statuses/75907873892339712" | |
matchObj = re.match('(.)+statuses\/(\d+)', str, re.I) | |
return matchObj.group(2) | |
def findURL(str): | |
# input string is a link in the following format: <a href="X" rel="X">link text </a> | |
# this regex extracts the link text, if input is a link, otherwise returns the same string | |
# (sometimes twitter source is just 'web') | |
matchObj = re.match('.+>(.+)<\/a>', str, re.I) | |
if matchObj: | |
return matchObj.group(1) | |
else: | |
#print "No match" | |
return str | |
reader = csv.reader(open('in.csv', 'rU'), delimiter=',', quotechar='"') | |
writer = csv.writer(open('out.csv', 'wb'), delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL) | |
rowcount = 0 | |
for row in reader: | |
# write the header line first, no transformations or lookups needed | |
if rowcount == 0: | |
writer.writerow(row) | |
else: | |
if rowcount > 0: | |
#print row[1] | |
if row[2] == "TWITTER": | |
id = findID(row[1]) | |
url = 'https://api.twitter.com/1/statuses/show.json?id=' + id | |
#print url | |
# insert try here because sometimes repsponse codes are 403 (forbidden) or 404 (not found) | |
try: | |
req = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"}) | |
data = urllib2.urlopen(url) | |
source = data.read() | |
obj = json.loads(source) | |
#print obj["source"] | |
cleanstr = findURL(obj["source"]).replace(u'\xa0', u' ') | |
cleanstr = cleanstr.replace(u'\xae', u' ') | |
row[3] = cleanstr | |
#print row | |
except urllib2.HTTPError, e: | |
print e.fp.read() | |
row[3] = 'not authorized' | |
else: | |
row[3] = 'n/a' | |
writer.writerow(row) | |
rowcount += 1 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment