This code scrapes tweets via the Twitter Search API and can collect tweets up to about 7-10 days back in time (the standard Search API only indexes recent tweets). Tweepy needs to be installed. Code modified from Bhaskar V. Karambelkar (https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./)
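Tweepy is available from PyPI, so if it is not installed yet, pip install tweepy is the usual route.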
# This code searches for tweets with a particular keyword and writes selected fields into a CSV file
import sys
import csv
import tweepy

# Replace YOUR_API_KEY and YOUR_SECRET_API_KEY with your application's key and secret.
# This code uses AppAuthHandler (app-only auth) rather than OAuthHandler, because it
# gets roughly 2.5x higher rate limits on the search endpoint.
auth = tweepy.AppAuthHandler('YOUR_API_KEY', 'YOUR_SECRET_API_KEY')
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

if not api:
    print("Can't Authenticate")
    sys.exit(-1)
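
# Note: this script targets the Tweepy 3.x API. Tweepy 4.x renamed api.search to
# api.search_tweets and dropped the wait_on_rate_limit_notify argument, so the
# calls below would need small adjustments on newer versions.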

def clean(val):
    # Guard against missing values (None) so csv.writer always gets a string
    return val if val else ""

searchQuery = ''      # Your hashtag(s) or keyword(s); join multiple terms with OR
maxTweets = 80000     # Large upper bound on the number of tweets to collect
tweetsPerQry = 100    # The maximum the API permits per request
fName = 'myfile.csv'  # The CSV file where your tweets will be stored

# If results from a specific ID onwards are required, set sinceId to that ID;
# else default to no lower limit and go as far back as the API allows.
sinceId = None

# If only results below a specific ID are required, set max_id to that ID;
# else default to no upper limit and start from the most recent tweet matching the query.
max_id = -1

tweetCount = 0
#print("Downloading max {0} tweets".format(maxTweets))

# Open the output file exactly once and create the writer on that handle
with open(fName, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    while tweetCount < maxTweets:
        try:
            if max_id <= 0:
                if not sinceId:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                # max_id=str(max_id - 1) asks for tweets strictly older than the
                # last one seen, which is how the loop pages backwards in time
                if not sinceId:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                # The details you want to scrape. Add other/additional data by
                # using different attributes; see the Twitter developer docs.
                csvwriter.writerow([tweet.created_at,
                                    clean(tweet.user.screen_name),
                                    clean(tweet.text),
                                    tweet.user.created_at,  # Might seem arbitrary, but good for identifying bots
                                    tweet.user.followers_count,
                                    tweet.user.friends_count,
                                    tweet.user.statuses_count,
                                    clean(tweet.user.location),  # The location set by the user themselves
                                    tweet.user.geo_enabled,
                                    tweet.user.lang,
                                    clean(tweet.user.time_zone),
                                    tweet.retweet_count])
            tweetCount += len(new_tweets)
            #print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except Exception as e:
            # Just exit on any error; 'pass' here would spin forever on a
            # persistent failure
            print("some error : " + str(e))
            break

print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))