Skip to content

Instantly share code, notes, and snippets.

@hbmartin
Created December 24, 2019 19:52
Show Gist options
  • Save hbmartin/dc0ccd51b7b7807b60dc4d9bc259bd86 to your computer and use it in GitHub Desktop.
Save hbmartin/dc0ccd51b7b7807b60dc4d9bc259bd86 to your computer and use it in GitHub Desktop.
Download a Twitter user's tweet corpus with output suitable for NLP / GPT-2 training
#!/usr/bin/env python3
import json
import re
import sys
import tweepy
# Load the API credentials and build the authenticated tweepy client that the
# rest of the script uses. The path is relative, so the file is looked up in
# the current working directory.
try:
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("twitter_secrets.json", encoding="utf-8") as json_file:
        twitter_secrets = json.load(json_file)
except FileNotFoundError:
    print("Please place twitter_secrets.json in the same directory as this script")
    print("It must have the following keys: consumer_key, consumer_secret, access_token, access_token_secret")
    sys.exit(1)

# Fail with a readable message (instead of a KeyError traceback) when the file
# exists but one or more of the required credentials is absent.
missing_secrets = [
    key
    for key in ("consumer_key", "consumer_secret", "access_token", "access_token_secret")
    if key not in twitter_secrets
]
if missing_secrets:
    print("twitter_secrets.json is missing: %s" % ", ".join(missing_secrets))
    print("It must have the following keys: consumer_key, consumer_secret, access_token, access_token_secret")
    sys.exit(1)

auth = tweepy.OAuthHandler(twitter_secrets["consumer_key"], twitter_secrets["consumer_secret"])
auth.set_access_token(twitter_secrets["access_token"], twitter_secrets["access_token_secret"])
api = tweepy.API(auth)
def save_all_tweets(screen_name, min_length=20):
    """Download a user's tweets and write them to ``<screen_name>.txt``.

    The timeline is fetched page by page (retweets excluded) through the
    module-level ``api`` client until a request returns no tweets. URLs and
    trailing whitespace are removed from every tweet, tweets whose cleaned
    text is not strictly longer than *min_length* characters are dropped, and
    the remainder is written to the output file with a ``<|endoftext|>`` line
    between consecutive tweets.

    Args:
        screen_name: Twitter user name to download; also the stem of the
            output file, which is created in the current working directory.
        min_length: Cleaned tweets must be longer than this many characters
            to be kept.
    """
    url_pattern = re.compile(r"https?://\S+")
    # Arguments shared by every page request (200 is the maximum allowed count).
    request_args = {
        "screen_name": screen_name,
        "count": 200,
        "include_rts": False,
        "trim_user": True,
        "tweet_mode": "extended",
    }

    all_tweets = []
    # Twitter only allows access to a user's most recent 3240 tweets with this method.
    # Make the initial request for the most recent tweets.
    new_tweets = api.user_timeline(**request_args)
    while new_tweets:
        all_tweets.extend(new_tweets)
        print("Last downloaded tweet: %s" % all_tweets[-1].full_text)
        print("...%s tweets downloaded so far" % (len(all_tweets)))
        # Request only tweets with an id below the last one received, so the
        # next page does not repeat it.
        oldest = all_tweets[-1].id - 1
        new_tweets = api.user_timeline(max_id=oldest, **request_args)

    cleaned_text = [url_pattern.sub("", tweet.full_text).rstrip() for tweet in all_tweets]
    long_text = [text for text in cleaned_text if len(text) > min_length]

    # Write the output. Tweets routinely contain emoji and non-Latin text, so
    # the encoding is pinned to UTF-8 rather than left to the platform default
    # (which raises UnicodeEncodeError on e.g. cp1252 consoles/locales).
    with open("%s.txt" % screen_name, "w", encoding="utf-8") as text_file:
        text_file.write("\n<|endoftext|>\n".join(long_text))
# Command-line driver: every argument after the program name is treated as a
# Twitter user name whose tweets are downloaded in turn.
if len(sys.argv) == 1:
    # Only the program name is present, so there is nothing to download.
    print("Pass twitter user names as arguments")
    sys.exit(1)

for arg in sys.argv[1:]:
    save_all_tweets(arg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment