jrwiebe/gettweets.sh

## gettweets.sh
#!/bin/bash
# Download the yearly "condensed" tweet archives and merge them into a single
# JSON array (newest year first) written to $outfile.
# This script requires 'wget' and 'unzip'

set -uo pipefail

current_year=$(date '+%Y')
### You can edit these values ###
start_year=2017
end_year=$current_year
outfile="condensed_all.json"
#################################
base_url="https://github.com/bpb27/trump_tweet_data_archive/raw/master"

for tool in wget unzip; do
    if ! command -v "$tool" > /dev/null; then
        echo "Error: this script requires '$tool'" >&2
        exit 1
    fi
done

# Intermediate files live in a private scratch directory so that a failed run
# never leaves a half-written $outfile behind.
tmp_dir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmp_dir"' EXIT

# Unpack condensed_<year>.json from its cached archive into $tmp_dir.
# Arguments: $1 - year
# Exits the script if the archive cannot be unpacked or lacks the JSON file.
extract_year() {
    local year=$1
    local zip="condensed_${year}.json.zip"
    if ! unzip -o -q "$zip" -d "$tmp_dir" \
        || [[ ! -f "${tmp_dir}/condensed_${year}.json" ]]; then
        echo "Error: could not extract condensed_${year}.json from $zip" >&2
        exit 1
    fi
}

current_zip="condensed_${current_year}.json.zip"
if [[ "$end_year" -eq "$current_year" && -f "$current_zip" ]]; then
    # Drop the cached copy so the current year's archive is fetched again.
    echo "Removing old $current_zip"
    rm -- "$current_zip"
fi

# Fetch every archive that is not cached yet and remember which years are
# actually available; years that cannot be downloaded are skipped with a
# warning instead of corrupting the merged output.
years=()
for ((year = start_year; year <= end_year; year++)); do
    zip="condensed_${year}.json.zip"
    if [[ -f "$zip" ]]; then
        echo "$zip exists"
    elif wget -nv -O "${zip}.part" "${base_url}/${zip}"; then
        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete cached archive on the next run.
        mv -- "${zip}.part" "$zip"
    else
        rm -f -- "${zip}.part"
        echo "Warning: could not download $zip; skipping $year" >&2
        continue
    fi
    years+=("$year")
done

if ((${#years[@]} == 0)); then
    echo "Error: no tweet archives available for ${start_year}-${end_year}" >&2
    exit 1
fi

echo "Writing $outfile"
if ((${#years[@]} == 1)); then
    # A single year is already a complete JSON array; use it unchanged.
    year=${years[0]}
    extract_year "$year"
    mv -- "${tmp_dir}/condensed_${year}.json" "$outfile"
else
    merged="${tmp_dir}/merged.json"
    chunk="${tmp_dir}/chunk.json"
    need_comma=0
    echo "[" > "$merged"
    for ((i = ${#years[@]} - 1; i >= 0; i--)); do
        year=${years[i]}
        extract_year "$year"
        # Strip only the array's own opening and closing bracket (first and
        # last line), so multi-line JSON files survive intact.
        if ! sed -e '1s/^[[:space:]]*\[//' -e '$s/\][[:space:]]*$//' \
            "${tmp_dir}/condensed_${year}.json" > "$chunk"; then
            echo "Error: could not process condensed_${year}.json" >&2
            exit 1
        fi
        rm -f -- "${tmp_dir}/condensed_${year}.json"
        # A year with an empty array contributes nothing and must not get a
        # separator either, or the result would contain ",,".
        if grep -q '[^[:space:]]' "$chunk"; then
            if ((need_comma)); then
                echo "," >> "$merged"
            fi
            cat "$chunk" >> "$merged"
            need_comma=1
        fi
    done
    echo "]" >> "$merged"
    mv -- "$merged" "$outfile"
fi

## trumplinkedtweettime.py
"""
Calculates the average elapsed time between @realDonaldTrump's
linked tweets. (Tweets are identified as linked if they begin or end with
ellipsis [...].)

To run:
-------
$ ./gettweets.sh
$ python trumplinkedtweettime.py -e 4 condensed_all.json

The example argument "-e 4" is optional, and specifies the minimum ellipsis
length (the default is 3).
"""

import sys, argparse, json
from datetime import datetime, timedelta

def secs_to_minsec(secs):
    """Split a duration in seconds into a (minutes, seconds) tuple."""
    minutes, remainder = divmod(secs, 60)
    return (minutes, remainder)

def main():
    """Print the time gap between ellipsis-linked tweets, then a summary.

    Command-line arguments:
      -e NUM    minimum number of dots forming a linking ellipsis (default 3)
      -t MIN    discard pairs separated by more than MIN minutes (default 60)
      jsonfile  JSON file containing a list of tweet objects; the 'text',
                'created_at' and 'id_str' keys of each tweet are read

    The list is walked pairwise, treating each tweet's successor in the list
    as the tweet posted before it. A pair counts as linked when the later
    tweet starts with the ellipsis or the earlier one ends with it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', metavar='NUM_ELLIPSIS_DOTS', type=int,
                        default=3, help='minimum number of ellipsis dots '
                        'linking tweets (default=%(default)s)')
    parser.add_argument('-t', metavar='DISCARD_THRESHOLD', type=int,
                        default=60, help='Discard apparently-linked tweets '
                        'separated by more than this many minutes '
                        '(default=%(default)s)')
    parser.add_argument('jsonfile', help='JSON file containing tweets')
    args = parser.parse_args()
    num_dots = args.e
    if num_dots < 1:
        # An empty ellipsis would match every tweet.
        parser.error('NUM_ELLIPSIS_DOTS must be at least 1')
    threshold = args.t * 60
    ellipsis = '.' * num_dots

    # Only the load needs the file handle; release it before analysing.
    with open(args.jsonfile, 'r') as jsonfile:
        tweets = json.load(jsonfile)

    # The time is spelled out (rather than %X) so that parsing does not
    # depend on the locale's time representation.
    dt_format = '%a %b %d %H:%M:%S +0000 %Y'
    count = 0
    delta_sum = 0
    longest = 0
    longest_id_a = longest_id_b = 0
    shortest = float('inf')

    for tweet, prev_tweet in zip(tweets, tweets[1:]):
        if not (tweet['text'].startswith(ellipsis)
                or prev_tweet['text'].endswith(ellipsis)):
            continue
        dt_current = datetime.strptime(tweet['created_at'], dt_format)
        dt_previous = datetime.strptime(prev_tweet['created_at'], dt_format)
        delta_secs = (dt_current - dt_previous).total_seconds()
        if delta_secs > threshold:
            continue
        count += 1
        delta_sum += delta_secs
        words = len(tweet['text'].split())
        # Tweets sharing a timestamp have no measurable typing time; report
        # an infinite rate instead of dividing by zero.
        wpm = words / (delta_secs / 60) if delta_secs else float('inf')
        print("%2dm%ds\t: %s, tweet %s to %s (%0.1f wpm)" %
              (secs_to_minsec(delta_secs) +
               (dt_current.strftime('%Y-%m-%d %H:%M:%S'),
                prev_tweet['id_str'], tweet['id_str'], wpm)))
        if delta_secs > longest:
            longest = delta_secs
            longest_id_b = tweet['id_str']
            longest_id_a = prev_tweet['id_str']
        if delta_secs < shortest:
            shortest = delta_secs

    print("\nTweets linked by ellipsis of at least %d dots" % num_dots)
    print("Discarding tweet pairs separated by more than %dm%ds" % secs_to_minsec(threshold))
    if count == 0:
        # Nothing to average; the statistics below would divide by zero.
        print("\nNo linked tweets found")
        return

    avg_seconds = delta_sum / count
    print("\nAverage time between linked tweets: %dm%ds" % secs_to_minsec(avg_seconds))
    print("Longest : %dm%ds (tweets %s and %s)" % (secs_to_minsec(longest) + (longest_id_a, longest_id_b)))
    print("Shortest: %dm%ds" % secs_to_minsec(shortest))

# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
	#!/bin/bash
	# Download the yearly "condensed" tweet archives and merge them into a single
	# JSON array (newest year first) written to $outfile.
	# This script requires 'wget' and 'unzip'

	set -uo pipefail

	current_year=$(date '+%Y')
	### You can edit these values ###
	start_year=2017
	end_year=$current_year
	outfile="condensed_all.json"
	#################################
	base_url="https://github.com/bpb27/trump_tweet_data_archive/raw/master"

	for tool in wget unzip; do
	    if ! command -v "$tool" > /dev/null; then
	        echo "Error: this script requires '$tool'" >&2
	        exit 1
	    fi
	done

	# Intermediate files live in a private scratch directory so that a failed run
	# never leaves a half-written $outfile behind.
	tmp_dir=$(mktemp -d) || exit 1
	trap 'rm -rf -- "$tmp_dir"' EXIT

	# Unpack condensed_<year>.json from its cached archive into $tmp_dir.
	# Arguments: $1 - year
	# Exits the script if the archive cannot be unpacked or lacks the JSON file.
	extract_year() {
	    local year=$1
	    local zip="condensed_${year}.json.zip"
	    if ! unzip -o -q "$zip" -d "$tmp_dir" \
	        || [[ ! -f "${tmp_dir}/condensed_${year}.json" ]]; then
	        echo "Error: could not extract condensed_${year}.json from $zip" >&2
	        exit 1
	    fi
	}

	current_zip="condensed_${current_year}.json.zip"
	if [[ "$end_year" -eq "$current_year" && -f "$current_zip" ]]; then
	    # Drop the cached copy so the current year's archive is fetched again.
	    echo "Removing old $current_zip"
	    rm -- "$current_zip"
	fi

	# Fetch every archive that is not cached yet and remember which years are
	# actually available; years that cannot be downloaded are skipped with a
	# warning instead of corrupting the merged output.
	years=()
	for ((year = start_year; year <= end_year; year++)); do
	    zip="condensed_${year}.json.zip"
	    if [[ -f "$zip" ]]; then
	        echo "$zip exists"
	    elif wget -nv -O "${zip}.part" "${base_url}/${zip}"; then
	        # Download under a temporary name so an interrupted transfer is never
	        # mistaken for a complete cached archive on the next run.
	        mv -- "${zip}.part" "$zip"
	    else
	        rm -f -- "${zip}.part"
	        echo "Warning: could not download $zip; skipping $year" >&2
	        continue
	    fi
	    years+=("$year")
	done

	if ((${#years[@]} == 0)); then
	    echo "Error: no tweet archives available for ${start_year}-${end_year}" >&2
	    exit 1
	fi

	echo "Writing $outfile"
	if ((${#years[@]} == 1)); then
	    # A single year is already a complete JSON array; use it unchanged.
	    year=${years[0]}
	    extract_year "$year"
	    mv -- "${tmp_dir}/condensed_${year}.json" "$outfile"
	else
	    merged="${tmp_dir}/merged.json"
	    chunk="${tmp_dir}/chunk.json"
	    need_comma=0
	    echo "[" > "$merged"
	    for ((i = ${#years[@]} - 1; i >= 0; i--)); do
	        year=${years[i]}
	        extract_year "$year"
	        # Strip only the array's own opening and closing bracket (first and
	        # last line), so multi-line JSON files survive intact. The stages are
	        # joined by a real pipe/redirect, not an escaped "\|".
	        if ! sed -e '1s/^[[:space:]]*\[//' -e '$s/\][[:space:]]*$//' \
	            "${tmp_dir}/condensed_${year}.json" > "$chunk"; then
	            echo "Error: could not process condensed_${year}.json" >&2
	            exit 1
	        fi
	        rm -f -- "${tmp_dir}/condensed_${year}.json"
	        # A year with an empty array contributes nothing and must not get a
	        # separator either, or the result would contain ",,".
	        if grep -q '[^[:space:]]' "$chunk"; then
	            if ((need_comma)); then
	                echo "," >> "$merged"
	            fi
	            cat "$chunk" >> "$merged"
	            need_comma=1
	        fi
	    done
	    echo "]" >> "$merged"
	    mv -- "$merged" "$outfile"
	fi
	"""
	Calculates the average elapsed time between @realDonaldTrump's
	linked tweets. (Tweets are identified as linked if they begin or end with
	ellipsis [...].)

	To run:
	-------
	$ ./gettweets.sh
	$ python trumplinkedtweettime.py -e 4 condensed_all.json

	The example argument "-e 4" is optional, and specifies the minimum ellipsis
	length (the default is 3).
	"""

	import sys, argparse, json
	from datetime import datetime, timedelta

	def secs_to_minsec(secs):
	    """Split a duration in seconds into a (minutes, seconds) tuple."""
	    return divmod(secs, 60)

	def main():
	    """Print the time gap between ellipsis-linked tweets, then a summary.

	    Command-line arguments:
	      -e NUM    minimum number of dots forming a linking ellipsis (default 3)
	      -t MIN    discard pairs separated by more than MIN minutes (default 60)
	      jsonfile  JSON file containing a list of tweet objects; the 'text',
	                'created_at' and 'id_str' keys of each tweet are read

	    The list is walked pairwise, treating each tweet's successor in the list
	    as the tweet posted before it. A pair counts as linked when the later
	    tweet starts with the ellipsis or the earlier one ends with it.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('-e', metavar='NUM_ELLIPSIS_DOTS', type=int,
	                        default=3, help='minimum number of ellipsis dots '
	                        'linking tweets (default=%(default)s)')
	    parser.add_argument('-t', metavar='DISCARD_THRESHOLD', type=int,
	                        default=60, help='Discard apparently-linked tweets '
	                        'separated by more than this many minutes '
	                        '(default=%(default)s)')
	    parser.add_argument('jsonfile', help='JSON file containing tweets')
	    args = parser.parse_args()
	    num_dots = args.e
	    if num_dots < 1:
	        # An empty ellipsis would match every tweet.
	        parser.error('NUM_ELLIPSIS_DOTS must be at least 1')
	    threshold = args.t * 60
	    ellipsis = '.' * num_dots

	    # Only the load needs the file handle; release it before analysing.
	    with open(args.jsonfile, 'r') as jsonfile:
	        tweets = json.load(jsonfile)

	    # The time is spelled out (rather than %X) so that parsing does not
	    # depend on the locale's time representation.
	    dt_format = '%a %b %d %H:%M:%S +0000 %Y'
	    count = 0
	    delta_sum = 0
	    longest = 0
	    longest_id_a = longest_id_b = 0
	    shortest = float('inf')

	    for tweet, prev_tweet in zip(tweets, tweets[1:]):
	        if not (tweet['text'].startswith(ellipsis)
	                or prev_tweet['text'].endswith(ellipsis)):
	            continue
	        dt_current = datetime.strptime(tweet['created_at'], dt_format)
	        dt_previous = datetime.strptime(prev_tweet['created_at'], dt_format)
	        delta_secs = (dt_current - dt_previous).total_seconds()
	        if delta_secs > threshold:
	            continue
	        count += 1
	        delta_sum += delta_secs
	        words = len(tweet['text'].split())
	        # Tweets sharing a timestamp have no measurable typing time; report
	        # an infinite rate instead of dividing by zero.
	        wpm = words / (delta_secs / 60) if delta_secs else float('inf')
	        print("%2dm%ds\t: %s, tweet %s to %s (%0.1f wpm)" %
	              (secs_to_minsec(delta_secs) +
	               (dt_current.strftime('%Y-%m-%d %H:%M:%S'),
	                prev_tweet['id_str'], tweet['id_str'], wpm)))
	        if delta_secs > longest:
	            longest = delta_secs
	            longest_id_b = tweet['id_str']
	            longest_id_a = prev_tweet['id_str']
	        if delta_secs < shortest:
	            shortest = delta_secs

	    print("\nTweets linked by ellipsis of at least %d dots" % num_dots)
	    print("Discarding tweet pairs separated by more than %dm%ds" % secs_to_minsec(threshold))
	    if count == 0:
	        # Nothing to average; the statistics below would divide by zero.
	        print("\nNo linked tweets found")
	        return

	    avg_seconds = delta_sum / count
	    print("\nAverage time between linked tweets: %dm%ds" % secs_to_minsec(avg_seconds))
	    print("Longest : %dm%ds (tweets %s and %s)" % (secs_to_minsec(longest) + (longest_id_a, longest_id_b)))
	    print("Shortest: %dm%ds" % secs_to_minsec(shortest))

	# Script entry point: run the analysis only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()