Sorts Markdown-formatted tweets into monthly files; see http://beckism.com/2012/07/archiving-tweets/ for info
#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
This script parses a text file of tweets (generated by [IFTTT][1],
for instance) and sorts them into files by month. You can run it
manually from the command line:

    cd /path/to/containing/folder
    ./archive-tweets.py /path/to/@username.txt

Or run it automatically using [Hazel][2] or similar. The script
expects that you have a file named like your Twitter username with
tweets formatted and delimited like so:

    My tweet text
    [July 04, 2012 at 06:48AM](http://twitter.com/link/to/status)

    - - -

And that you want your tweets broken up by month in a subfolder next
to the original file. You can change the delimiting characters between
tweets and the name of the final archive file using the config variables
below.

By default, this script will also try to resolve t.co shortened links
into their original URLs. You can disable this by setting the
`expand_tco_links` config variable below to `False`.

[1]: http://ifttt.com/
[2]: http://www.noodlesoft.com/hazel.php
'''
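
# For example (illustrative, not from the original docs): given the sample tweet
# above in a file named '@username.txt', this script would append it to
# 'archive/@username-2012-07.txt', with tweets joined by `final_separator`.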

# CONFIG: adjust to your liking
separator_re = r'\s+- - -\s+'      # IFTTT adds extra spaces, so we have to use a regex
final_separator = '\n\n- - -\n\n'  # What you want in your final monthly archives
archive_directory = 'archive'      # The sub-directory you want your monthly archives in
expand_tco_links = True            # Whether you want t.co links expanded or not (slower!)
sanitize_usernames = False         # Whether you want username underscores backslash-escaped

# Don't edit below here unless you know what you're doing!
import sys
import os.path
import re
import dateutil.parser
import urllib2
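
# Note: dateutil is a third-party package (install with `pip install python-dateutil`);
# the other imports are part of the Python 2 standard library.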

# Utility function for expanding t.co links
def expand_tco(match):
    url = match.group(0)
    # Only expand if we have a t.co link
    if expand_tco_links and (url.startswith('http://t.co/') or url.startswith('https://t.co/')):
        # Follow the redirect (with a 15-second timeout) and grab the final URL
        final_url = urllib2.urlopen(url, None, 15).geturl()
    else:
        final_url = url
    # Make link self-linking for Markdown
    return '<' + final_url.strip() + '>'

# Utility function for sanitizing underscores in usernames
def sanitize_characters(match):
    if sanitize_usernames:
        return match.group(0).replace('_', r'\_')
    else:
        return match.group(0)
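
# Illustrative examples (not part of the original script, hypothetical URLs/usernames):
#   link_re.sub(expand_tco, 'see http://t.co/AbCd123')
#       -> 'see <http://example.com/original-page>'  (result depends on where t.co redirects)
#   re.sub(r'@[a-z0-9]*_[a-z0-9_]+', sanitize_characters, 'cc @some_user')
#       -> 'cc @some\_user'  (only when sanitize_usernames is True)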

# Grab our paths
filepath = sys.argv[1]
username, ext = os.path.splitext(os.path.basename(filepath))
root_dir = os.path.dirname(filepath)
archive_dir = os.path.join(root_dir, archive_directory)

# Read our tweets from the file
file = open(filepath, 'r+')
tweets = file.read()
tweets = re.split(separator_re, tweets)

# Clear out the file
file.truncate(0)
file.close()
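
# At this point `tweets` is a list of individual tweet blocks, and the source
# file has been emptied so that only newly appended tweets accumulate in it.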

# Parse through our tweets and find their dates
tweet_re = re.compile(r'^(.*?)(\[([^\]]+)\]\([^(]+\))$', re.S)
# Link regex derivative of John Gruber's: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
link_re = re.compile(r'\b(https?://(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.I)
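# tweet_re capture groups: group(1) is the tweet body, group(2) is the full
# "[date](status URL)" Markdown link, and group(3) is just the date text.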

dated_tweets = {}
for tweet in tweets:
    if len(tweet) > 0:
        # Parse our tweet
        matched_tweet = tweet_re.match(tweet)
        # Escape underscores in usernames (if configured), then replace
        # t.co links in the body with their expanded versions
        sanitized_body = re.sub(r'@[a-z0-9]*_[a-z0-9_]+', sanitize_characters, matched_tweet.group(1))
        formatted_tweet = link_re.sub(expand_tco, sanitized_body) + matched_tweet.group(2)
        # Grab our date, and toss the tweet into our dated dictionary
        date = dateutil.parser.parse(matched_tweet.group(3)).strftime('%Y-%m')
        if date not in dated_tweets:
            dated_tweets[date] = []
        dated_tweets[date].append(formatted_tweet)
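
# dated_tweets maps 'YYYY-MM' strings to lists of formatted tweets, e.g. (illustrative):
#   {'2012-07': ['My tweet text\n[July 04, 2012 at 06:48AM](...)', ...]}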
# Now we have our dated tweets; loop through them and write to disk
for date, tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, username + '-' + date + ext)
    # Construct our string with a trailing separator, just in case of future tweets
    tweet_string = final_separator.join(tweets) + final_separator
    # Append our tweets to the archive file (the archive directory is assumed to exist)
    file = open(month_path, 'a')
    file.write(tweet_string)
    file.close()

# All done!