Created
July 4, 2012 14:25
-
-
Save onecrayon/3047627 to your computer and use it in GitHub Desktop.
Sorts Markdown-formatted tweets monthly files; see http://beckism.com/2012/07/archiving-tweets/ for info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
This script parses a text file of tweets (generated by [IFTTT][1],
for instance) and sorts them into files by month. You can run it
manually from the command line:

    cd /path/to/containing/folder
    ./archive-tweets.py /path/to/@username.txt

Or run it automatically using [Hazel][2] or similar. The script
expects that you have a file named like your Twitter username with
tweets formatted and delimited like so:

    My tweet text
    [July 04, 2012 at 06:48AM](http://twitter.com/link/to/status)
    - - -

It also assumes that you want your tweets broken up by month in a
subfolder next to the original file. You can change the delimiting
characters between tweets and the name of the archive subfolder using
the config variables below.

By default, this script will also try to resolve t.co shortened links
into their original URLs. You can disable this by setting the
`expand_tco_links` config variable below to `False`.

[1]: http://ifttt.com/
[2]: http://www.noodlesoft.com/hazel.php
'''
# CONFIG: adjust to your liking
# Each value below is read by the script further down; only the values are
# meant to be edited, not the names.
separator_re = r'\s+- - -\s+' # Regex used to split the source file into tweets (IFTTT adds extra spaces, so a plain string will not do)
final_separator = '\n\n- - -\n\n' # Delimiter written between (and after) tweets in your final monthly archives
archive_directory = 'archive' # The sub-directory (next to the source file) you want your monthly archives in
expand_tco_links = True # Whether you want t.co links expanded or not (slower: one HTTP request per link)
sanitize_usernames = False # Whether you want underscores in @usernames backslash-escaped

# Don't edit below here unless you know what you're doing!
import sys | |
import os.path | |
import re | |
import dateutil.parser | |
import urllib2 | |
# Utility function for expanding t.co links
def expand_tco(match):
    '''
    Regex-substitution callback that turns a matched URL into a Markdown
    self-link (`<url>`), resolving t.co short links on the way.

    `match` is a regex match object whose whole match (group 0) is the URL.
    The URL is only resolved over the network when it points at t.co and
    the `expand_tco_links` config variable is true; every other URL is
    returned unchanged apart from the angle brackets.

    Link expansion is best-effort: if the request fails for any reason the
    original short URL is kept and a warning is written to stderr, so one
    dead link or a flaky connection cannot abort the whole archive run.
    '''
    url = match.group(0)
    final_url = url
    # Test the prefix first so ordinary links never touch the network path
    if url.startswith(('http://t.co/', 'https://t.co/')) and expand_tco_links:
        try:
            response = urllib2.urlopen(url, None, 15)
            try:
                # After redirects, geturl() is the address we ended up at
                final_url = response.geturl()
            finally:
                response.close()
        except Exception as err:
            # Deliberately broad: timeouts, DNS failures, HTTP errors and
            # malformed responses should all degrade to the unexpanded
            # link rather than stop the script.
            sys.stderr.write('Could not expand %s: %s\n' % (url, err))
    # Make link self-linking for Markdown
    return '<' + final_url.strip() + '>'
# Regex callback: optionally backslash-escape underscores in a username
def sanitize_characters(match):
    '''
    Return the text of `match` (group 0), with every underscore
    backslash-escaped when the `sanitize_usernames` config variable is
    true, and untouched otherwise.
    '''
    handle = match.group(0)
    if not sanitize_usernames:
        return handle
    return handle.replace('_', r'\_')
# Grab our paths
if len(sys.argv) < 2:
    sys.exit('Usage: archive-tweets.py /path/to/@username.txt')
filepath = sys.argv[1]
username, ext = os.path.splitext(os.path.basename(filepath))
root_dir = os.path.dirname(filepath)
archive_dir = os.path.join(root_dir, archive_directory)

# Read our tweets from the file. The file is NOT cleared yet: it is only
# rewritten once the archives are safely on disk, so a failure while
# parsing or writing cannot lose tweets.
with open(filepath, 'r') as source_file:
    original_text = source_file.read()
tweets = re.split(separator_re, original_text)

# Parse through our tweets and find their dates
# Group 1 is the tweet body, group 2 the trailing Markdown link and
# group 3 the link text (the date).
tweet_re = re.compile(r'^(.*?)(\[([^\]]+)\]\([^(]+\))$', re.S)
# Link regex derivative of John Gruber's: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
link_re = re.compile(r'\b(https?://(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.I)
# Usernames containing an underscore; case-insensitive because Twitter
# handles may contain capital letters
username_re = re.compile(r'@[a-z0-9]*_[a-z0-9_]+', re.I)

dated_tweets = {}
unparsed_tweets = []
for tweet in tweets:
    # Stray whitespace at the very start or end of the file would
    # otherwise keep tweet_re from matching
    tweet = tweet.strip()
    if not tweet:
        continue
    # Parse our tweet; anything we cannot understand stays in the source file
    matched_tweet = tweet_re.match(tweet)
    if matched_tweet is None:
        unparsed_tweets.append(tweet)
        continue
    try:
        date = dateutil.parser.parse(matched_tweet.group(3)).strftime('%Y-%m')
    except (ValueError, OverflowError):
        unparsed_tweets.append(tweet)
        continue
    # Escape usernames, then replace t.co links with expanded versions
    sanitized_body = username_re.sub(sanitize_characters, matched_tweet.group(1))
    formatted_tweet = link_re.sub(expand_tco, sanitized_body) + matched_tweet.group(2)
    # Toss the tweet into our dated dictionary
    dated_tweets.setdefault(date, []).append(formatted_tweet)

# Now we have our dated tweets; loop through them and write to disk
if dated_tweets and not os.path.isdir(archive_dir):
    os.makedirs(archive_dir)
for date, month_tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, username + '-' + date + ext)
    # Construct our string with a trailing separator, just in case of future tweets
    tweet_string = final_separator.join(month_tweets) + final_separator
    # Append our tweets to the archive file
    with open(month_path, 'a') as archive_file:
        archive_file.write(tweet_string)

# Everything is archived, so clear out the source file, keeping only what
# we could not archive.
if unparsed_tweets:
    sys.stderr.write('%d tweet(s) could not be parsed and were left in %s\n'
                     % (len(unparsed_tweets), filepath))
leftover = ''.join(tweet + final_separator for tweet in unparsed_tweets)
with open(filepath, 'r+') as source_file:
    current_text = source_file.read()
    if current_text.startswith(original_text):
        # Tweets appended while we were busy (link expansion can be slow)
        appended = current_text[len(original_text):]
    else:
        # The file was rewritten underneath us; keep its contents rather
        # than risk dropping tweets we never saw (duplicates are possible)
        appended = current_text
    source_file.seek(0)
    source_file.truncate()
    source_file.write(leftover + appended)
# All done!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment