Skip to content

Instantly share code, notes, and snippets.

@emallson
Created November 20, 2015 01:55
Show Gist options
  • Save emallson/9e56a99973b3091124cd to your computer and use it in GitHub Desktop.
Save emallson/9e56a99973b3091124cd to your computer and use it in GitHub Desktop.
Iterating over user timelines with Twython
from twython import Twython, TwythonRateLimitError, TwythonError
from glob import glob
from util import sleep_until
from csv import DictReader, DictWriter
import os
# Twitter application credentials. Both are empty placeholders in this file
# and have to be filled in before the script can talk to the API.
# NOTE(review): passing only an app key plus access_token looks like
# application-only (OAuth 2) auth -- confirm against the Twython docs.
APP_KEY = ''
ACCESS_TOKEN = ''
# Shared API client; timeline() below issues all of its requests through it.
tw = Twython(APP_KEY, access_token=ACCESS_TOKEN)
# Tweet keys that are kept for every tweet. The same list is handed to
# DictWriter, so it also defines the column order of the output CSV files.
fields = ['id', 'user', 'created_at', 'lang', 'text']
def timeline(user_id, max_id=None):
    """Generate the tweets of one user's timeline, page by page.

    Each request asks for tweets with an id no greater than ``max_id``
    (``None`` on the first call means no upper bound is sent by us). After
    every yielded tweet the bound is moved to just below that tweet's id,
    so the next request continues where the previous page ended.

    We can get up to 15 pages; the loop allows a 16th request so that the
    empty page which ends the iteration can actually be observed.

    Args:
        user_id: Id of the user whose timeline is read.
        max_id: Optional tweet id to start from.

    Yields:
        The tweet objects returned by ``tw.get_user_timeline``, one at a
        time, in the order the API returned them.
    """
    # Request options that never change between pages.
    fixed_options = dict(count=200,
                         trim_user=True,
                         exclude_replies=True,
                         include_rts=False)
    for _ in range(16):
        page = tw.get_user_timeline(user_id=user_id,
                                    max_id=max_id,
                                    **fixed_options)
        if len(page) == 0:  # last page should have zero results
            return
        for status in page:
            # Next request must start strictly below the tweet just seen.
            max_id = status['id'] - 1
            yield status
# I have a directory of csv files containing user profiles.
# Pull the IDs from these and dump each user's timeline into
# csvs/timelines/<id>.csv (one file per user, columns given by `fields`).
for csv in glob("csvs/*.csv"):
    with open(csv, "r") as csvfile:
        reader = DictReader(csvfile)
        for row in reader:
            path = "csvs/timelines/{}.csv".format(row['id'])
            if os.path.exists(path):
                continue  # skip users that have already been read.
            # NOTE(review): the output file is created before anything is
            # fetched, so an interrupted run leaves a partial file behind
            # and the check above will skip that user next time.
            # NOTE(review): on Python 3 the csv docs recommend opening csv
            # files with newline='' -- left as-is because the target
            # interpreter version is not visible here.
            with open(path, "w") as timecsv:
                print(row['id'])
                writer = DictWriter(timecsv, fields)
                writer.writeheader()
                # max_id is used to continue a user's timeline from
                # the last tweet read in the event that we get
                # rate-limited in the middle of a user's timeline
                max_id = None
                while True:
                    try:
                        for tweet in timeline(row['id'], max_id):
                            # Keep only the columns we write out.
                            sub = {k: v for k, v in tweet.items()
                                   if k in fields}
                            # Flatten the user object down to its id.
                            sub['user'] = sub['user']['id']
                            writer.writerow(sub)
                            # Resume strictly *below* the tweet just
                            # written: the API's max_id bound is
                            # inclusive, so resuming at sub['id'] would
                            # fetch and write this tweet a second time
                            # after a rate-limit retry.
                            max_id = sub['id'] - 1
                        # Timeline exhausted; done with this user.
                        max_id = None
                        break
                    except TwythonRateLimitError as e:
                        # Wait for the rate-limit window to reset, then
                        # loop around and resume from max_id.
                        sleep_until(e.retry_after)  # sleep until the given date
                    except TwythonError as e:
                        # I *think* this is caused by protected profiles.
                        # I can read some user profile info, but not the
                        # timeline
                        print(e)
                        print("Skipping...")
                        break
from datetime import datetime
import time
def sleep_until(ts):
    """Sleep until the given UTC UNIX TIMESTAMP.

    Args:
        ts: Seconds since the epoch (UTC). Anything ``int()`` accepts,
            e.g. an int or a numeric string.

    Returns:
        None. Returns without sleeping when the timestamp is already in
        the past.
    """
    next_time = datetime.utcfromtimestamp(int(ts))
    now = datetime.utcnow()
    # total_seconds() keeps both the sign and the days component of the
    # difference. The ``.seconds`` attribute drops them, which turned a
    # deadline that had just passed into a sleep of almost 24 hours.
    remaining = (next_time - now).total_seconds()
    if remaining > 0:
        # Round up to a whole second so we never wake before the deadline
        # (waking early would just trigger the rate limit again).
        offset = int(remaining) + 1
    else:
        offset = 0
    print("Enhancing calm. Next try: {} (Currently {})".format(next_time, now))
    print("Sleeping for {}...".format(offset))
    time.sleep(offset)
    print("Continuing...")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment