Skip to content

Instantly share code, notes, and snippets.

@onecrayon
Created July 4, 2012 14:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save onecrayon/3047627 to your computer and use it in GitHub Desktop.
Save onecrayon/3047627 to your computer and use it in GitHub Desktop.
Sorts Markdown-formatted tweets monthly files; see http://beckism.com/2012/07/archiving-tweets/ for info
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
This script parses a text file of tweets (generated by [IFTTT][1],
for instance) and sorts them into files by month. You can run it
manually from the command line:
    cd /path/to/containing/folder
    ./archive-tweets.py /path/to/@username.txt
Or run it automatically using [Hazel][2] or similar. The script
expects that you have a file named like your Twitter username with
tweets formatted and delimited like so:
    My tweet text
    [July 04, 2012 at 06:48AM](http://twitter.com/link/to/status)
    - - -
And that you want your tweets broken up by month in a subfolder next
to the original file. You can change the delimiting characters between
tweets and the name of the final archive file using the config variables
below.
By default, this script will also try to resolve t.co shortened links
into their original URLs. You can disable this by setting the
`expand_tco_links` config variable below to `False`.
[1]: http://ifttt.com/
[2]: http://www.noodlesoft.com/hazel.php
'''
# CONFIG: adjust to your liking
separator_re = r'\s+- - -\s+' # Regex used to split the input file into tweets; IFTTT adds extra spaces, so have to use a regex
final_separator = '\n\n- - -\n\n' # What you want between (and after) tweets in your final monthly archives
archive_directory = 'archive' # The sub-directory (next to the input file) you want your monthly archives in
expand_tco_links = True # Whether you want t.co links expanded or not (slower!)
sanitize_usernames = False # Whether you want username underscores backslash escaped
# Don't edit below here unless you know what you're doing!
import sys
import os.path
import re
import dateutil.parser
import urllib2
# Utility function for expanding t.co links
def expand_tco(match):
    '''
    Regex substitution callback that turns a matched URL into a
    self-linking Markdown autolink (`<url>`).

    When the `expand_tco_links` config variable is true and the URL is a
    t.co short link, the link is requested and replaced with the URL it
    finally resolves to. Expansion is best effort: if the request fails
    the short link is kept and a warning is written to stderr, so a dead
    link or a flaky network cannot abort the whole archiving run.

    match -- regex match object whose entire match (group 0) is the URL

    Returns the URL, stripped of surrounding whitespace, wrapped in
    angle brackets.
    '''
    url = match.group(0)
    final_url = url
    # Only expand if we have a t.co link
    if expand_tco_links and url.startswith(('http://t.co/', 'https://t.co/')):
        try:
            # Third positional argument is the timeout in seconds
            response = urllib2.urlopen(url, None, 15)
            try:
                final_url = response.geturl()
            finally:
                # Release the connection now rather than at garbage collection
                response.close()
        except IOError as error:
            # urllib2.URLError (and therefore HTTPError) as well as socket
            # errors and timeouts all derive from IOError
            sys.stderr.write('Could not expand %s (%s); keeping the short link\n' % (url, error))
    # Make link self-linking for Markdown
    return '<' + final_url.strip() + '>'
# Utility function for sanitizing underscores in usernames
def sanitize_characters(match):
    '''
    Regex substitution callback that backslash-escapes every underscore
    in the matched text when the `sanitize_usernames` config variable is
    true; otherwise the matched text is returned unchanged.

    match -- regex match object; only its entire match (group 0) is used
    '''
    matched_text = match.group(0)
    # Escaping is opt-in, so bail out early when it is switched off
    if not sanitize_usernames:
        return matched_text
    return matched_text.replace('_', r'\_')
# Grab our paths
if len(sys.argv) < 2:
    sys.exit('Usage: archive-tweets.py /path/to/@username.txt')
filepath = sys.argv[1]
username, ext = os.path.splitext(os.path.basename(filepath))
root_dir = os.path.dirname(filepath)
archive_dir = os.path.join(root_dir, archive_directory)

# Read our tweets from the file. The file is deliberately NOT cleared here:
# it is only emptied at the very end, once every tweet has been parsed and
# written to its archive, so a failure along the way cannot lose tweets.
with open(filepath, 'r') as source_file:
    tweets = re.split(separator_re, source_file.read())

# Parse through our tweets and find their dates
# Groups: 1 = tweet body, 2 = the whole [date](link) line, 3 = the date text
tweet_re = re.compile(r'^(.*?)(\[([^\]]+)\]\([^(]+\))$', re.S)
# Link regex derivative of John Gruber's: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
link_re = re.compile(r'\b(https?://(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.I)
# @usernames that contain at least one underscore; case-insensitive so that
# names with capital letters (e.g. @Foo_Bar) are matched as well
username_re = re.compile(r'@[a-z0-9]*_[a-z0-9_]+', re.I)
dated_tweets = {}
for tweet in tweets:
    # Stray whitespace around a chunk would keep the tweet regex from
    # matching, and whitespace-only chunks are not tweets at all
    tweet = tweet.strip()
    if not tweet:
        continue
    # Parse our tweet
    matched_tweet = tweet_re.match(tweet)
    if matched_tweet is None:
        # Stop before anything has been written; the source file is untouched
        raise ValueError('Could not parse tweet (expected text followed by a [date](link) line): %r' % tweet)
    # Escape underscores in usernames (if enabled in the config)
    sanitized_body = username_re.sub(sanitize_characters, matched_tweet.group(1))
    # Replace t.co links with expanded versions
    formatted_tweet = link_re.sub(expand_tco, sanitized_body) + matched_tweet.group(2)
    # Grab our date, and toss the tweet into our dated dictionary
    date = dateutil.parser.parse(matched_tweet.group(3)).strftime('%Y-%m')
    dated_tweets.setdefault(date, []).append(formatted_tweet)

# Make sure the archive sub-directory exists before we try to write into it
if dated_tweets and not os.path.isdir(archive_dir):
    os.makedirs(archive_dir)

# Now we have our dated tweets; loop through them and write to disk
for date, month_tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, username + '-' + date + ext)
    # Construct our string with a trailing separator, just in case of future tweets
    tweet_string = final_separator.join(month_tweets) + final_separator
    # Append our tweets to the archive file
    with open(month_path, 'a') as archive_file:
        archive_file.write(tweet_string)

# Everything is safely archived, so now clear out the original file
with open(filepath, 'w'):
    pass
# All done!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment