Last active
December 15, 2015 12:09
-
-
Save revox/2dc5e0e4cc269bf40701 to your computer and use it in GitHub Desktop.
Twitter search API (last 7 days) into a CSV file with some profile info from the tweets (based on code from MTSW, 2nd ed, example 9.4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Don't keep this script in public_html: it holds your Twitter API credentials!
import twitter | |
import sys,json,csv | |
def twitter_search(twitter_api, q, max_results=200):
    """Search recent tweets matching ``q`` and return their status dicts.

    The first page is requested with ``twitter_api.search.tweets(q=q,
    count=100)``; further pages are fetched by following the
    ``next_results`` query string in each response's ``search_metadata``
    until enough statuses are collected or no further page is offered.

    See https://dev.twitter.com/docs/api/1.1/get/search/tweets and
    https://dev.twitter.com/docs/using-search for details on advanced
    search criteria. Keep in mind that OAuth users can "only" make 180
    search queries per 15-minute interval, see
    https://dev.twitter.com/docs/rate-limiting/1.1/limits for details.
    A reasonable number of results is ~1000-1500, although that number of
    results may not exist for all queries.

    Args:
        twitter_api: Client object exposing ``search.tweets(**kwargs)``.
        q: The search query string.
        max_results: Maximum number of statuses to return. Capped at 2000
            in case Twitter changes the API and lets more results out.

    Returns:
        A list of at most ``min(2000, max_results)`` status dicts.
    """
    # Imported locally (with a fallback) so the function works on both
    # Python 2 and Python 3 without touching the file-level imports.
    try:
        from urllib.parse import parse_qsl  # Python 3
    except ImportError:
        from urlparse import parse_qsl  # Python 2

    search_results = twitter_api.search.tweets(q=q, count=100)
    print(json.dumps(search_results['search_metadata'], indent=1))
    statuses = list(search_results['statuses'])

    # Enforce a reasonable upper limit.
    max_results = min(2000, max_results)

    while len(statuses) < max_results:
        # No more results when next_results doesn't exist.
        next_results = search_results['search_metadata'].get('next_results')
        if not next_results:
            break

        # next_results has the following form:
        #   ?max_id=313519052523986943&q=NCAA&include_entities=1
        query = next_results[1:]
        if not isinstance(query, str):
            # Python 2 ``unicode``: encode first so percent-escapes decode to
            # UTF-8 bytes instead of being mis-decoded as Latin-1 characters.
            query = query.encode('utf-8')

        # parse_qsl percent-decodes the values, so the client does not
        # encode them a second time (e.g. '%23' must become '#' again), and
        # it copes with values that themselves contain '='.
        kwargs = dict(parse_qsl(query, keep_blank_values=True))

        search_results = twitter_api.search.tweets(**kwargs)
        page = search_results['statuses']
        if not page:
            # An empty page means nothing more to collect; stop rather than
            # keep spending rate-limited requests.
            break
        statuses += page

    # The last page may overshoot the requested amount.
    return statuses[:max_results]
''' helper functions, clean data, unpack dictionaries ''' | |
def getVal(val):
    """Return ``val`` in a form ready to be written as a CSV cell.

    Booleans and integers are passed through unchanged. Any other truthy
    value is encoded to UTF-8 via its ``encode`` method. Any other falsy
    value (``None``, an empty string) becomes the empty string.
    """
    # Numbers and flags need no cleaning.
    if isinstance(val, (bool, int)):
        return val
    return val.encode('utf-8') if val else ""
def getLng(val):
    """Return the first element of ``val['coordinates']``.

    This value fills the 'coordinates lng' CSV column. ``None`` is returned
    when ``val`` is not a dict (e.g. the tweet carries no coordinates).
    """
    if not isinstance(val, dict):
        return None
    lng = val['coordinates'][0]
    return lng
def getLat(val):
    """Return the second element of ``val['coordinates']``.

    This value fills the 'coordinates lat' CSV column. ``None`` is returned
    when ``val`` is not a dict (e.g. the tweet carries no coordinates).
    """
    if not isinstance(val, dict):
        return None
    lat = val['coordinates'][1]
    return lat
def getPlace(val):
    """Return ``val['full_name']`` encoded as UTF-8.

    ``None`` is returned when ``val`` is not a dict (e.g. the tweet has no
    place attached).
    """
    if not isinstance(val, dict):
        return None
    full_name = val['full_name']
    return full_name.encode('utf-8')
# == OAuth Authentication ==
# The consumer keys can be found on your application's Details page.
consumer_key = ""
consumer_secret = ""
# Create an access token under the "Your access token" section.
access_token = ""
access_token_secret = ""

auth = twitter.oauth.OAuth(access_token,
                           access_token_secret,
                           consumer_key,
                           consumer_secret)
twitter_api = twitter.Twitter(auth=auth)
twitter_api.retry = True

# Sample usage
q = "starwars"
results = twitter_search(twitter_api, q, max_results=2000)
print(len(results))

# Show one sample search result by slicing the list...
# print json.dumps(results[0], indent=1)

# Write one CSV row per tweet. The ``with`` block guarantees the file is
# flushed and closed even if a tweet is missing an expected key.
with open(q + '_with_profiles.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['created_at',
                        'user-screen_name',
                        'text',
                        'coordinates lng',
                        'coordinates lat',
                        'place',
                        'user-location',
                        'user-geo_enabled',
                        'user-lang',
                        'user-time_zone',
                        'user-statuses_count',
                        'user-followers_count',
                        'user-created_at'])
    for tweet in results:
        user = tweet['user']
        csvwriter.writerow([tweet['created_at'],
                            getVal(user['screen_name']),
                            getVal(tweet['text']),
                            getLng(tweet['coordinates']),
                            getLat(tweet['coordinates']),
                            getPlace(tweet['place']),
                            getVal(user['location']),
                            getVal(user['geo_enabled']),
                            getVal(user['lang']),
                            getVal(user['time_zone']),
                            getVal(user['statuses_count']),
                            getVal(user['followers_count']),
                            getVal(user['created_at']),
                            ])
print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment