Created
April 14, 2013 20:06
-
-
Save cathalgarvey/5384003 to your computer and use it in GitHub Desktop.
grep_tweets, a companion script to cat_tweets that allows searching and filtering of Twitter tweet archive data by regex or a bunch of other useful parameters.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import time | |
import datetime | |
import json | |
import re | |
timestamp_format = '%a %b %d %H:%M:%S %z %Y' | |
def twitter_timestamp_to_obj(time_string):
    """Parse a Twitter timestamp string into a datetime object.

    The string is parsed with the module-level ``timestamp_format``; since
    that format contains ``%z``, the returned datetime is timezone-aware.
    """
    parsed = datetime.datetime.strptime(time_string, timestamp_format)
    return parsed
def twitter_timestamp_to_unix(time_string):
    """Convert a Twitter timestamp string to seconds since the Unix epoch.

    Args:
        time_string: A timestamp in the format accepted by
            ``twitter_timestamp_to_obj`` (e.g.
            "Sat Apr 13 18:04:17 +0000 2013").

    Returns:
        The POSIX timestamp as a float.

    Raises:
        ValueError: If ``time_string`` does not match the expected format.
    """
    # Parse the string that was actually passed in, and let the timezone-aware
    # datetime compute the epoch offset itself. Feeding a UTC struct_time to
    # time.mktime() would interpret it as *local* time and skew the result by
    # the machine's UTC offset.
    return twitter_timestamp_to_obj(time_string).timestamp()
def refilter(list, field, expression):
    """Yield each mapping in *list* whose *field* value matches *expression*.

    Args:
        list: Iterable of dict-like objects (e.g. tweets).
        field: Key to look up in each item; a missing key is treated as ''.
        expression: Regex pattern (string or compiled pattern) searched for
            anywhere in the field value.

    Yields:
        The items, in input order, for which the pattern matches.
    """
    pattern = re.compile(expression)
    for item in list:
        # Only the existence of a match matters, so search() (which stops at
        # the first hit) is used rather than collecting every match.
        if pattern.search(item.get(field, '')):
            yield item
def get_hashtags(tweet, hashtag):
    """Return the 'text' of every hashtag entity attached to *tweet*.

    *hashtag* is accepted but not used: all hashtag texts are returned, with
    '' standing in for any entity lacking a 'text' key.
    """
    texts = []
    for entity in tweet['entities'].get('hashtags', []):
        texts.append(entity.get('text', ''))
    return texts
def get_mention(tweet, username):
    """Return the 'screen_name' of every user mentioned in *tweet*.

    *username* is accepted but not used: all mentioned screen names are
    returned, with '' for any mention lacking a 'screen_name' key.
    """
    mentions = tweet['entities'].get('user_mentions', [])
    screen_names = list(map(lambda mention: mention.get('screen_name', ''), mentions))
    return screen_names
def get_mention_names(tweet, realname):
    """Return the 'name' of every user mentioned in *tweet*.

    *realname* is accepted but not used: all mentioned names are returned,
    with '' for any mention lacking a 'name' key.
    """
    names = []
    for mention in tweet['entities'].get('user_mentions', []):
        names.append(mention.get('name', ''))
    return names
def get_mention_ids(tweet, realname):
    """Return the 'id_str' of every user mentioned in *tweet*.

    *realname* is accepted but not used: all mentioned ids are returned,
    with '' for any mention lacking an 'id_str' key.
    """
    mentioned_users = tweet['entities'].get('user_mentions', [])
    id_strings = [user.get('id_str', '') for user in mentioned_users]
    return id_strings
def get_retweets(tweet_list):
    """Return a list of the tweets in *tweet_list* that are retweets.

    A tweet counts as a retweet when it has a "retweeted_status" key.
    """
    retweets = []
    for tweet in tweet_list:
        if "retweeted_status" in tweet:
            retweets.append(tweet)
    return retweets
def get_all_retweets_of_user(tweet_list, username):
    """Return the retweets in *tweet_list* whose original author is *username*.

    The original author is read from
    ``tweet['retweeted_status']['user']['screen_name']`` and compared exactly.
    """
    matches = []
    for retweet in get_retweets(tweet_list):
        original_author = retweet['retweeted_status']['user']['screen_name']
        if original_author == username:
            matches.append(retweet)
    return matches
def get_all_hashtag_uses(tweet_list, hashtag):
    """Yield the tweets in *tweet_list* that carry the hashtag *hashtag*.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        hashtag: Hashtag text to look for, without the leading '#'.
            The comparison is exact (case-sensitive).

    Yields:
        Each tweet, in input order, whose hashtag entities include *hashtag*.
    """
    for tweet in tweet_list:
        # get_hashtags() returns every hashtag on the tweet, so membership
        # must be tested explicitly; truthiness alone would accept any tweet
        # that has at least one hashtag of any kind.
        if hashtag in get_hashtags(tweet, hashtag):
            yield tweet
def get_all_mentions(tweet_list, username):
    """Yield the tweets in *tweet_list* that mention the user *username*.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        username: Screen name to look for, without the leading '@'.
            The comparison is exact (case-sensitive).

    Yields:
        Each tweet, in input order, whose user mentions include *username*.
    """
    for tweet in tweet_list:
        # get_mention() returns every mentioned screen name, so membership
        # must be tested explicitly; truthiness alone would accept any tweet
        # that mentions anybody at all.
        if username in get_mention(tweet, username):
            yield tweet
def get_all_mentions_by_name(tweet_list, username):
    """Yield the tweets in *tweet_list* that mention a user named *username*.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        username: Value to match against the 'name' field of each mention
            (as opposed to the 'screen_name'). The comparison is exact.

    Yields:
        Each tweet, in input order, with a mention whose name is *username*.
    """
    for tweet in tweet_list:
        # get_mention_names() returns every mentioned name, so membership
        # must be tested explicitly rather than relying on truthiness.
        if username in get_mention_names(tweet, username):
            yield tweet
def get_all_mentions_by_id(tweet_list, userid):
    """Yield the tweets in *tweet_list* that mention the user with id *userid*.

    Args:
        tweet_list: Iterable of tweet dicts, each with an 'entities' mapping.
        userid: User id to look for. It is converted with ``str()`` before
            comparison because mentions are matched on their 'id_str' field.

    Yields:
        Each tweet, in input order, with a mention whose id_str is *userid*.
    """
    wanted = str(userid)
    for tweet in tweet_list:
        # Look at mention ids (not mention names), and test membership
        # explicitly rather than relying on the list being non-empty.
        if wanted in get_mention_ids(tweet, userid):
            yield tweet
def get_all_tweets_by(tweet_list, username):
    """Return tweets authored by *username*, plus retweets of that user.

    Args:
        tweet_list: Sequence of tweet dicts. It is traversed more than once,
            so a one-shot iterator will not work.
        username: Screen name to match exactly, without the leading '@'.

    Returns:
        A new list holding every retweet of *username* and every other tweet
        whose ``tweet['user']['screen_name']`` is *username*, sorted ascending
        by ``tweet['unix_timestamp']``.
        NOTE(review): presumably 'unix_timestamp' is added by cat_tweets;
        verify against the producer of the tweet file.
    """
    results = get_all_retweets_of_user(tweet_list, username)
    # Track already-selected tweets by identity. This gives the same outcome
    # as a ``tweet not in results`` scan (any tweet equal to a selected
    # retweet is itself a selected retweet) without a deep dict comparison
    # against every collected result for every tweet.
    already_included = {id(tweet) for tweet in results}
    own_tweets = [
        tweet for tweet in tweet_list
        if id(tweet) not in already_included
        and tweet['user']['screen_name'] == username
    ]
    results.extend(own_tweets)
    return sorted(results, key=lambda x: x['unix_timestamp'])
def grep_text(tweet_list, expression):
    """Return a list of the tweets whose "text" field matches *expression*.

    Matching is delegated to ``refilter``; its lazy results are collected
    into a list before being returned.
    """
    matching = refilter(tweet_list, "text", expression)
    return list(matching)
if __name__ == "__main__":
    # Command-line entry point: load a tweet file, apply the requested
    # filters in turn, and print one formatted line per remaining tweet.
    import argparse

    # TODO: terminal-width-aware wrapping (shutil.get_terminal_size) and a
    # -l/--single-line switch were sketched here but never implemented.
    ArgP = argparse.ArgumentParser(
        description=("A grep-mimic for tweet lists serialised with cat_tweets. "
                     "Doesn't double-count due to inclusion of retweeted messages, "
                     "supports tweet-specific switches."),
        epilog="by Cathal Garvey, released under GNU Affero GPL v3.")
    ArgP.add_argument("tweetfile",
        help="File containing ordered, JSON formatted tweets as returned by cat_tweets.")
    ArgP.add_argument("-p", "--pattern",
        help="Regex pattern to match in tweet text/body.")
    ArgP.add_argument("-u", "--username",
        help="Limit results to tweets from this username only. (exclude '@')")
    ArgP.add_argument("-m", "--mentions",
        help="Limit results to tweets that mention this username only. (exclude '@')")
    ArgP.add_argument("-t", "--hashtag",
        help="Limit results to tweets containing this hashtag. (exclude '#')")
    args = ArgP.parse_args()

    with open(args.tweetfile) as InF:
        tweets = json.load(InF)

    # The username filter runs first: it needs a real list (it traverses its
    # input more than once), whereas the later filters accept any iterable.
    if args.username:
        tweets = get_all_tweets_by(tweets, args.username)
    if args.mentions:
        tweets = get_all_mentions(tweets, args.mentions)
    # Apply -t/--hashtag, which is advertised in the help text above.
    if args.hashtag:
        tweets = get_all_hashtag_uses(tweets, args.hashtag)
    if args.pattern:
        tweets = grep_text(tweets, args.pattern)

    for tweet in tweets:
        # For a retweet, report the text, author and time of the original
        # tweet stored under "retweeted_status".
        source = tweet.get("retweeted_status", tweet)
        created_utc = twitter_timestamp_to_obj(source['created_at']).utctimetuple()
        # "<15" / "<140" pad with trailing spaces to a minimum width and never
        # truncate longer values.
        formattedtweet = "{author:<15} at {time}: {tweet:<140}".format(
            author=source['user']['screen_name'],
            tweet=source['text'],
            time=time.strftime("%Y-%m-%d-%H:%M:%S UTC", created_utc))
        print(formattedtweet)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment