Skip to content

Instantly share code, notes, and snippets.

@esenthil2018
Created October 11, 2020 18:08
Show Gist options
  • Save esenthil2018/f1bc81b8c60573de1ca5640222d72a2f to your computer and use it in GitHub Desktop.
# --- Imports and Twitter API authentication ---
# NOTE(review): the original snippet imported only `requests` but uses
# tweepy, sys, pandas, and IPython's clear_output further down; they are
# imported here so the script actually runs.
import sys

import requests
import tweepy
import pandas as pd
from IPython.display import clear_output

# Fill in your own app credentials from the Twitter developer portal.
TWITTER_KEY = 'Your twitter key'
TWITTER_SECRET_KEY = 'Your secret key'

# Authenticate with app-only (bearer-token) auth; wait_on_rate_limit makes
# tweepy sleep through rate-limit windows instead of raising immediately.
auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET_KEY)
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)
if not api:
    print("Can't Authenticate")
    sys.exit(-1)
#@title Twitter Search API Inputs
#@markdown ### Enter Search Query:
searchQuery = '#giraffe ' #@param {type:"string"}
#@markdown ### Enter Max Tweets To Scrape:
#@markdown #### The Twitter API Rate Limit (currently) is 45,000 tweets every 15 minutes.
maxTweets = 5000 #@param {type:"slider", min:0, max:45000, step:100}
Filter_Retweets = True #@param {type:"boolean"}

tweetsPerQry = 100  # maximum page size the standard search API permits
tweet_lst = []      # accumulates one row per downloaded tweet

# Optionally narrow the query so native retweets are excluded from results.
if Filter_Retweets:
    searchQuery += ' -filter:retweets'

# Pagination bounds for the download loop below:
#  - sinceId: only fetch tweets newer than this ID; None means go as far
#    back as the API allows.
#  - max_id: only fetch tweets at or below this ID; seeded negative so the
#    first request is unconstrained and starts from the most recent tweet.
sinceId = None
max_id = -10000000000
# --- Download loop: page backwards through search results until maxTweets ---
# (FIX: removed a no-op module-level `global vimage` statement and a stray
# debug `print('test)')` with a typo'd string.)
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))

while tweetCount < maxTweets:
    try:
        # Combine the since_id / max_id bounds that are currently active.
        if max_id <= 0:
            if not sinceId:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en")
            else:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", since_id=sinceId)
        else:
            if not sinceId:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", max_id=str(max_id - 1))
            else:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", max_id=str(max_id - 1),
                                        since_id=sinceId)
        if not new_tweets:
            print("No more tweets found")
            break

        for tweet in new_tweets:
            # These attributes are only present on some API tiers/objects,
            # so fall back to defaults when missing.
            reply_count = getattr(tweet, 'reply_count', 0)
            retweeted = getattr(tweet, 'retweeted', "NA")

            # Derive a readable topic label from the query: the text before
            # the '-filter:...' clause. NOTE(review): if the query has no
            # '-', find() returns -1 and this silently drops the last char.
            topic = searchQuery[:searchQuery.find('-')].capitalize().strip()
            tweetDate = tweet.created_at.date()

            # FIX: initialise per tweet. The original left these undefined
            # until the first photo was seen (NameError for photo-less
            # tweets) and carried stale image/url values across tweets.
            timage = None
            turl = None
            for media in tweet.entities.get("media", [{}]):
                # Only fetch entities that are actual photos.
                if media.get("type", None) == "photo":
                    turl = media["media_url"]
                    timage = requests.get(media["media_url"])

            tweet_lst.append([tweetDate, topic,
                              tweet.id, tweet.user.screen_name,
                              tweet.user.name, tweet.text,
                              tweet.favorite_count, reply_count,
                              tweet.user.location, timage, turl,
                              tweet.user.description, tweet.retweet_count,
                              retweeted])

        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # Next page: everything strictly older than the oldest tweet seen.
        max_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        # Bail out on any API error; rate limits are already handled by
        # wait_on_rate_limit in the API constructor.
        print("some error : " + str(e))
        break

clear_output()
print("Downloaded {0} tweets".format(tweetCount))
# --- Persist and preview the collected tweets ---
# FIX: use None (not the long-deprecated -1, rejected by modern pandas) to
# disable column-width truncation so full tweet text displays.
pd.set_option('display.max_colwidth', None)

# One column per field captured in the download loop, in append order.
tweet_df = pd.DataFrame(tweet_lst,
                        columns=['tweet_dt', 'topic', 'id', 'screenname',
                                 'username', 'tweet', 'like_count',
                                 'reply_count', 'location', 'timage', 'turl',
                                 'description', 'retweet_count', 'retweeted'])
tweet_df.to_csv('tweets1.csv')
tweet_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment