Skip to content

Instantly share code, notes, and snippets.

@arulrajnet
Created January 4, 2015 18:08
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arulrajnet/0b71842b573d81c7bc23 to your computer and use it in GitHub Desktop.
Save arulrajnet/0b71842b573d81c7bc23 to your computer and use it in GitHub Desktop.
Dump all tweets as CSV of given twitter handle using #Tweepy https://github.com/tweepy/tweepy
# -*- coding: utf-8 -*-
import tweepy # https://github.com/tweepy/tweepy
import json
import csv
import sys
# Twitter API credentials
consumer_key = "your_consumer_key"
consumer_secret = "your_consumer_secret"
access_key = "your_access_key"
access_secret = "your_access_secret"
# Converts to Tweepy Status object to JSON and add that as json attribute
# Quick monkeypatch https://gist.github.com/misja/1183424
@classmethod
def parse(cls, api, raw):
status = cls.first_parse(api, raw)
setattr(status, 'json', json.dumps(raw))
return status
tweepy.models.Status.first_parse = tweepy.models.Status.parse
tweepy.models.Status.parse = parse
def get_all_tweets(screen_name):
# Twitter only allows access to a users most recent 3240 tweets with this method
# authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name=screen_name, count=200)
# Writing into file
out_file = open('%s_tweets.csv' % screen_name, 'wb')
writer = csv.DictWriter(out_file, fieldnames=["id", "created_at", "text"], extrasaction='ignore',
delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
for tweet in new_tweets:
t = json.loads(tweet.json)
if "text" in t:
t["text"] = t["text"].encode("utf-8")
writer.writerow(t)
# save the id of the oldest tweet less one
oldest = new_tweets[-1].id - 1
no_of_tweets = len(new_tweets)
# keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
print "\t getting tweets before %s" % oldest
# all subsequent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
if new_tweets is None or len(new_tweets) == 0:
continue
# update the id of the oldest tweet less one
oldest = new_tweets[-1].id - 1
no_of_tweets += len(new_tweets)
# Writing into file
for tweet in new_tweets:
t = json.loads(tweet.json)
if "text" in t:
t["text"] = t["text"].encode("utf-8")
writer.writerow(t)
print "\t ...%s tweets downloaded so far" % no_of_tweets
def usage():
print "Usage :"
print """\t python %s @twitter_handle""" % sys.argv[0]
print "\t OR"
print """\t python %s '["@twitter_handle","twitter_handle1", "@twitter_handle2"]'""" % sys.argv[0]
if __name__ == '__main__':
# pass in the username or list of the account you want to download
# Array of handles get from xpath in chrome $x("//span[contains(@class,'username js-action-profile-name')]/text()")
if len(sys.argv) == 1:
usage()
sys.exit()
given_value = sys.argv[1]
try:
_handles_ = json.loads(given_value)
for twitter_handle in _handles_:
twitter_handle = twitter_handle.replace("@", "")
print "Downloading Tweets of %s" % twitter_handle
try:
get_all_tweets(twitter_handle)
except Exception, e:
print "Error in Downloading %s, %s" % (twitter_handle, e)
continue
except Exception, e:
# If not a JSON document
print "Its not an JSON array. So trying ordinary way."
given_value = given_value.replace("@", "")
print "Downloading Tweets of %s" % given_value
get_all_tweets(given_value)
print "Done"
@mrgloom
Copy link

mrgloom commented Feb 14, 2018

For me it worked like:

len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 200
len(new_tweets) 43
len(new_tweets) 0
no_of_tweets 3243

Is it limitation of 3240 tweets? And how to grab all tweets?

Also why you substruct 1 from id of last tweet? like oldest = new_tweets[-1].id - 1 , maybe it should be just oldest = new_tweets[-1].id ?
Ids look more like random numbers and not like serial numbers.

id 963598082691694594
id 963597759994564608
id 963597617241509888
id 963596847544709120
id 963592688540012544
id 963592168328843264

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment