Skip to content

Instantly share code, notes, and snippets.

@cathalgarvey
Created April 14, 2013 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cathalgarvey/5384003 to your computer and use it in GitHub Desktop.
Save cathalgarvey/5384003 to your computer and use it in GitHub Desktop.
grep_tweets, a companion script to cat_tweets that allows searching and filtering of Twitter tweet archive data by regex or a bunch of other useful parameters.
#!/usr/bin/env python3
import time
import datetime
import json
import re
# strptime layout of Twitter's "created_at" strings,
# e.g. "Sat Apr 13 18:04:17 +0000 2013".
timestamp_format = '%a %b %d %H:%M:%S %z %Y'


def twitter_timestamp_to_obj(time_string):
    """Parse a Twitter-style timestamp into a timezone-aware datetime.

    Args:
        time_string: Text laid out as described by ``timestamp_format``.

    Returns:
        A ``datetime.datetime`` carrying the UTC offset found in the string.

    Raises:
        ValueError: If ``time_string`` does not match ``timestamp_format``.
    """
    parse = datetime.datetime.strptime
    return parse(time_string, timestamp_format)
def twitter_timestamp_to_unix(time_string):
    """Convert a Twitter-style timestamp string to Unix epoch seconds.

    Args:
        time_string: Timestamp text in the layout accepted by
            ``twitter_timestamp_to_obj``.

    Returns:
        Seconds since the Unix epoch, as a float.

    Raises:
        ValueError: If ``time_string`` cannot be parsed.
    """
    # Parse the caller's string (not a fixed sample date) and let the
    # timezone-aware datetime compute the epoch value itself.  Feeding a UTC
    # struct_time to time.mktime() would be wrong on any machine whose local
    # zone is not UTC, because mktime() interprets its argument as local time.
    return twitter_timestamp_to_obj(time_string).timestamp()
def refilter(list, field, expression):
    """Lazily yield the items of *list* whose *field* value matches a regex.

    Args:
        list: Iterable of dict-like objects supporting ``.get``.
        field: Key whose value is searched; a missing key is treated as ''.
        expression: Regular expression (pattern string or compiled pattern).

    Yields:
        Each item, in input order, for which the pattern matches anywhere in
        the field's value.
    """
    pattern = re.compile(expression)
    for item in list:
        value = item.get(field, '')
        if pattern.search(value) is not None:
            yield item
def get_hashtags(tweet, hashtag):
    """Return the text of every hashtag entity attached to *tweet*.

    Args:
        tweet: Tweet dict with an 'entities' mapping.
        hashtag: Accepted but not consulted by this function.

    Returns:
        A list with one string per hashtag entity ('' when an entity has no
        'text' key); empty when the tweet has no 'hashtags' entry.
    """
    entities = tweet['entities']
    tags = entities.get('hashtags', [])
    return [tag.get('text', '') for tag in tags]
def get_mention(tweet, username):
    """Return the screen name of every user mentioned in *tweet*.

    Args:
        tweet: Tweet dict with an 'entities' mapping.
        username: Accepted but not consulted by this function.

    Returns:
        A list with one string per 'user_mentions' entity ('' when an entity
        has no 'screen_name' key); empty when there are no mentions.
    """
    entities = tweet['entities']
    mentions = entities.get('user_mentions', [])
    return [mention.get('screen_name', '') for mention in mentions]
def get_mention_names(tweet, realname):
    """Return the display name of every user mentioned in *tweet*.

    Args:
        tweet: Tweet dict with an 'entities' mapping.
        realname: Accepted but not consulted by this function.

    Returns:
        A list with one string per 'user_mentions' entity ('' when an entity
        has no 'name' key); empty when there are no mentions.
    """
    entities = tweet['entities']
    mentions = entities.get('user_mentions', [])
    return [mention.get('name', '') for mention in mentions]
def get_mention_ids(tweet, realname):
    """Return the string id of every user mentioned in *tweet*.

    Args:
        tweet: Tweet dict with an 'entities' mapping.
        realname: Accepted but not consulted by this function.

    Returns:
        A list with one 'id_str' value per 'user_mentions' entity ('' when an
        entity has no 'id_str' key); empty when there are no mentions.
    """
    entities = tweet['entities']
    mentions = entities.get('user_mentions', [])
    return [mention.get('id_str', '') for mention in mentions]
def get_retweets(tweet_list):
    """Return the tweets in *tweet_list* that carry a 'retweeted_status' key.

    Args:
        tweet_list: Iterable of tweet dicts.

    Returns:
        A new list of the matching tweets, in input order.
    """
    retweets = []
    for tweet in tweet_list:
        if "retweeted_status" in tweet:
            retweets.append(tweet)
    return retweets
def get_all_retweets_of_user(tweet_list, username):
    """Return the retweets whose original message was posted by *username*.

    Args:
        tweet_list: Iterable of tweet dicts.
        username: Screen name compared (exactly) against the retweeted
            status's author.

    Returns:
        A list of the matching retweets, in input order.
    """
    matches = []
    for retweet in get_retweets(tweet_list):
        original_author = retweet['retweeted_status']['user']['screen_name']
        if original_author == username:
            matches.append(retweet)
    return matches
def get_all_hashtag_uses(tweet_list, hashtag):
    """Yield the tweets that use the given hashtag.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        hashtag: Hashtag text to look for, without the leading '#'. The
            comparison is an exact, case-sensitive string match.

    Yields:
        Each tweet, in input order, whose hashtag entities include *hashtag*.
    """
    for tweet in tweet_list:
        # Test membership rather than mere non-emptiness: a tweet that only
        # carries *other* hashtags must not be reported.
        if hashtag in get_hashtags(tweet, hashtag):
            yield tweet
def get_all_mentions(tweet_list, username):
    """Yield the tweets that mention the given screen name.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        username: Screen name to look for, without the leading '@'. The
            comparison is an exact, case-sensitive string match.

    Yields:
        Each tweet, in input order, whose user mentions include *username*.
    """
    for tweet in tweet_list:
        # Test membership rather than mere non-emptiness: a tweet that only
        # mentions *other* users must not be reported.
        if username in get_mention(tweet, username):
            yield tweet
def get_all_mentions_by_name(tweet_list, username):
    """Yield the tweets that mention a user with the given display name.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        username: Display ("real") name to look for. The comparison is an
            exact, case-sensitive string match against each mention's name.

    Yields:
        Each tweet, in input order, whose mentioned users include that name.
    """
    for tweet in tweet_list:
        # Test membership rather than mere non-emptiness: a tweet that only
        # mentions *other* users must not be reported.
        if username in get_mention_names(tweet, username):
            yield tweet
def get_all_mentions_by_id(tweet_list, userid):
    """Yield the tweets that mention the user with the given id.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        userid: User id to look for. It is converted with ``str()`` before
            comparison because mention ids are read from the 'id_str' field.

    Yields:
        Each tweet, in input order, whose mentioned users include that id.
    """
    wanted = str(userid)
    for tweet in tweet_list:
        # Look the id up among mention *ids* (not display names), and require
        # an actual match rather than the mere presence of any mention.
        if wanted in get_mention_ids(tweet, userid):
            yield tweet
def get_all_tweets_by(tweet_list, username):
    """Return every tweet written by *username*, oldest first.

    A tweet qualifies when it is a retweet of a message posted by *username*,
    or when it was posted from the *username* account itself.

    Args:
        tweet_list: Iterable of tweet dicts. Each selected tweet must carry a
            'unix_timestamp' key (presumably added by cat_tweets - confirm
            against the producer of the input file).
        username: Screen name to match exactly, without the leading '@'.

    Returns:
        A new list of the matching tweets sorted by 'unix_timestamp'. Among
        equal timestamps, retweets of the user precede the user's own posts.

    Raises:
        KeyError: If a selected tweet lacks 'unix_timestamp', or a tweet that
            is not a retweet of the user lacks 'user'/'screen_name'.
    """
    retweets_of_user = []
    own_posts = []
    # Classify each tweet once in a single pass.  This avoids comparing every
    # tweet against the whole retweet list (quadratic, deep dict equality)
    # and also works when tweet_list is a one-shot iterator.
    for tweet in tweet_list:
        if ("retweeted_status" in tweet
                and tweet['retweeted_status']['user']['screen_name'] == username):
            retweets_of_user.append(tweet)
        elif tweet['user']['screen_name'] == username:
            own_posts.append(tweet)
    # sorted() is stable, so concatenating retweets first fixes the tie order.
    return sorted(retweets_of_user + own_posts,
                  key=lambda tweet: tweet['unix_timestamp'])
def grep_text(tweet_list, expression):
    """Return the tweets whose 'text' field matches a regular expression.

    Args:
        tweet_list: Iterable of tweet dicts.
        expression: Regular expression searched for in each tweet's "text".

    Returns:
        A list of the matching tweets, in input order.
    """
    matching = refilter(tweet_list, "text", expression)
    return list(matching)
if __name__ == "__main__":
    # Command-line entry point: load a cat_tweets JSON file, apply the
    # requested filters in turn, and print one formatted line per tweet.
    import argparse

    # TODO: optionally wrap output to the terminal width
    # (shutil.get_terminal_size, Python >= 3.3) and add a -l/--single-line
    # switch to disable wrapping.
    ArgP = argparse.ArgumentParser(
        description=("A grep-mimic for tweet lists serialised with cat_tweets. "
                     "Doesn't double-count due to inclusion of retweeted messages, "
                     "supports tweet-specific switches."),
        epilog="by Cathal Garvey, released under GNU Affero GPL v3.")
    ArgP.add_argument("tweetfile",
        help="File containing ordered, JSON formatted tweets as returned by cat_tweets.")
    ArgP.add_argument("-p", "--pattern",
        help="Regex pattern to match in tweet text/body.")
    ArgP.add_argument("-u", "--username",
        help="Limit results to tweets from this username only. (exclude '@')")
    ArgP.add_argument("-m", "--mentions",
        help="Limit results to tweets that mention this username only. (exclude '@')")
    ArgP.add_argument("-t", "--hashtag",
        help="Limit results to tweets containing this hashtag. (exclude '#')")
    args = ArgP.parse_args()

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(args.tweetfile, encoding="utf-8") as InF:
        tweets = json.load(InF)

    # Each switch narrows the result of the previous one.
    if args.username:
        tweets = get_all_tweets_by(tweets, args.username)
    if args.mentions:
        tweets = get_all_mentions(tweets, args.mentions)
    if args.hashtag:
        # Honour -t/--hashtag (it was parsed but never applied): keep only
        # tweets whose hashtag entities include the requested text.
        tweets = [tweet for tweet in tweets
                  if args.hashtag in get_hashtags(tweet, args.hashtag)]
    if args.pattern:
        tweets = grep_text(tweets, args.pattern)

    for tweet in tweets:
        # For a retweet, report the original message, author and time rather
        # than the retweeting account's wrapper.
        shown = tweet["retweeted_status"] if "retweeted_status" in tweet else tweet
        created = twitter_timestamp_to_obj(shown['created_at']).utctimetuple()
        tweet_time = time.strftime("%Y-%m-%d-%H:%M:%S UTC", created)
        # Pad to fixed widths (15 for the author, 140 for the text) so that
        # columns line up; longer values are left untruncated.
        tweet_author = shown['user']['screen_name'].ljust(15)
        tweet_text = shown['text'].ljust(140)
        formattedtweet = "{author} at {time}: {tweet}".format(
            author=tweet_author, tweet=tweet_text, time=tweet_time)
        print(formattedtweet)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment