Last active
April 13, 2018 16:21
-
-
Save jrwiebe/df22665e69f9dcf436a3d7ad2be6f854 to your computer and use it in GitHub Desktop.
I noticed realDonaldTrump seemed to take a long time to write linked tweets, so I created this silly script to generate some stats.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Downloads the yearly "condensed" tweet archives for the configured range of
# years and merges them into a single JSON array written to $outfile.
#
# This script requires 'wget' and 'unzip'

current_year=$(date '+%Y')

### You can edit these values ###
start_year=2017
end_year=$current_year
outfile="condensed_all.json"
#################################

base_url="https://github.com/bpb27/trump_tweet_data_archive/raw/master"

current_json="condensed_${current_year}.json.zip"
if [[ "$end_year" -eq "$current_year" && -f "$current_json" ]]; then
    # Always re-download the current year's tweets (presumably because that
    # archive is still being updated upstream).
    echo "Removing old $current_json"
    rm -- "$current_json"
fi

# Fetch every yearly archive that is not already present locally.
year=$start_year
while [ "$year" -le "$end_year" ]; do
    archive="condensed_${year}.json.zip"
    if [ -f "$archive" ]; then
        echo "$archive exists"
    elif ! wget -nc -nv "${base_url}/${archive}"; then
        # Stop here: continuing would produce an incomplete or invalid $outfile.
        echo "Failed to download $archive" >&2
        rm -f -- "$archive"
        exit 1
    fi
    ((year+=1))
done

echo "Writing $outfile"
year=$end_year
if [ "$start_year" -eq "$year" ]; then
    # Single year: the extracted file already is the complete JSON array.
    unzip -o -q "condensed_${year}.json.zip" || exit 1
    mv -- "condensed_${year}.json" "$outfile"
else
    # Several years: concatenate the arrays, newest year first, into one array.
    echo "[" > "$outfile"
    while [ "$year" -ge "$start_year" ]; do
        unzip -o -q "condensed_${year}.json.zip" || exit 1
        # Drop the first and last character of each line, i.e. the enclosing
        # "[" and "]" (assumes each yearly file is a single-line JSON array).
        sed 's/^.//;s/.$//' "condensed_${year}.json" >> "$outfile"
        rm -- "condensed_${year}.json"
        if [ "$year" -ne "$start_year" ]; then
            echo "," >> "$outfile"
        fi
        ((year-=1))
    done
    echo "]" >> "$outfile"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Calculates the average elapsed time between @realDonaldTrump's
linked tweets. (Two consecutive tweets are treated as linked if the later one
begins with an ellipsis [...] or the earlier one ends with one.)

To run:
-------
$ ./gettweets.sh
$ python trumplinkedtweettime.py -e 4 condensed_all.json

The example argument "-e 4" is optional, and specifies the minimum ellipsis
length (the default is 3). The optional "-t" argument sets the maximum gap, in
minutes, between two tweets for them to still count as linked (the default is
60).
""" | |
import sys, argparse, json | |
from datetime import datetime, timedelta | |
def secs_to_minsec(secs):
    """Split a duration in seconds into minutes and leftover seconds.

    Args:
        secs: Duration in seconds (int or float).

    Returns:
        A ``(minutes, seconds)`` tuple, exactly as produced by
        ``divmod(secs, 60)``; suitable for ``"%dm%ds" % ...`` formatting.
    """
    return divmod(secs, 60)
def main(argv=None):
    """Print timing statistics for ellipsis-linked tweets in a JSON file.

    Each pair of adjacent tweets in the file is treated as linked when the
    first of the pair starts with the ellipsis or the second ends with it.
    For every linked pair whose time gap does not exceed the discard
    threshold, one line with the gap and a words-per-minute figure is
    printed, followed by a summary (average, longest and shortest gap).

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', metavar='NUM_ELLIPSIS_DOTS', type=int,
                        default=3, help='minimum number of ellipsis dots '
                        'linking tweets (default=%(default)s)')
    parser.add_argument('-t', metavar='DISCARD_THRESHOLD', type=int,
                        default=60, help='Discard apparently-linked tweets '
                        'separated by more than this many minutes '
                        '(default=%(default)s)')
    parser.add_argument('jsonfile', help='JSON file containing tweets')
    args = parser.parse_args(argv)

    num_dots = args.e
    threshold = args.t * 60  # minutes -> seconds
    ellipsis = '.' * num_dots

    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(args.jsonfile, 'r', encoding='utf-8') as jsonfile:
        tweets = json.load(jsonfile)

    # Explicit %H:%M:%S instead of %X, whose meaning depends on the locale.
    dt_format = '%a %b %d %H:%M:%S +0000 %Y'

    count = 0
    delta_sum = 0
    longest = 0
    longest_id_a = longest_id_b = 0
    shortest = float('inf')

    # Walk adjacent pairs. The element that follows a tweet in the list is
    # treated as the earlier one (assumes the file is ordered newest-first
    # -- TODO confirm against the data source).
    for tweet, prev_tweet in zip(tweets, tweets[1:]):
        linked = (tweet['text'].startswith(ellipsis)
                  or prev_tweet['text'].endswith(ellipsis))
        if not linked:
            continue

        dt_current = datetime.strptime(tweet['created_at'], dt_format)
        dt_previous = datetime.strptime(prev_tweet['created_at'], dt_format)
        delta_secs = (dt_current - dt_previous).total_seconds()

        # Pairs further apart than the threshold contribute to no statistic.
        if delta_secs > threshold:
            continue

        count += 1
        delta_sum += delta_secs

        words = len(tweet['text'].split())
        # Two tweets stamped with the same second would divide by zero.
        wpm = words / (delta_secs / 60) if delta_secs else float('inf')

        print("%2dm%ds\t: %s, tweet %s to %s (%0.1f wpm)" %
              (secs_to_minsec(delta_secs) +
               (dt_current.strftime('%Y-%m-%d %H:%M:%S'),
                prev_tweet['id_str'], tweet['id_str'], wpm)))

        if delta_secs > longest:
            longest = delta_secs
            longest_id_b = tweet['id_str']
            longest_id_a = prev_tweet['id_str']
        if delta_secs < shortest:
            shortest = delta_secs

    print("\nTweets linked by ellipsis of at least %d dots" % num_dots)
    print("Discarding tweet pairs separated by more than %dm%ds"
          % secs_to_minsec(threshold))

    if not count:
        # Nothing matched: there is no average to compute and no extremes
        # to report (the infinite initial 'shortest' cannot be formatted).
        print("\nNo linked tweets found")
        return

    avg_seconds = delta_sum / count
    print("\nAverage time between linked tweets: %dm%ds"
          % secs_to_minsec(avg_seconds))
    print("Longest : %dm%ds (tweets %s and %s)"
          % (secs_to_minsec(longest) + (longest_id_a, longest_id_b)))
    print("Shortest: %dm%ds" % secs_to_minsec(shortest))
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment