onecrayon/archive-tweets.py

## archive-tweets.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
This script parses a text file of tweets (generated by [IFTTT][1],
for instance) and sorts them into files by month. You can run it
manually from the command line:

    cd /path/to/containing/folder
    ./archive-tweets.py /path/to/@username.txt

Or run it automatically using [Hazel][2] or similar. The script
expects that you have a file named like your Twitter username with
tweets formatted and delimited like so:

    My tweet text

    [July 04, 2012 at 06:48AM](http://twitter.com/link/to/status)

    - - -

And that you want your tweets broken up by month in a subfolder next
to the original file. You can change the delimiting characters between
tweets and the name of the final archive file using the config variables
below.

By default, this script will also try to resolve t.co shortened links
into their original URLs. You can disable this by setting the
`expand_tco_links` config variable below to `False`.

   [1]: http://ifttt.com/
   [2]: http://www.noodlesoft.com/hazel.php
'''

# CONFIG: adjust to your liking
separator_re = r'\s+- - -\s+'     # Regex the source file is split on; IFTTT adds extra spaces, so a plain string won't do
final_separator = '\n\n- - -\n\n' # What gets written between (and after) tweets in your final monthly archives
archive_directory = 'archive'     # The sub-directory (next to the source file) you want your monthly archives in
expand_tco_links = True           # Whether you want t.co links expanded or not (slower!)
sanitize_usernames = False        # Whether you want username underscores backslash escaped

# Don't edit below here unless you know what you're doing!

import sys
import os.path
import re
import dateutil.parser
import urllib2

# Utility function for expanding t.co links
def expand_tco(match):
	'''
	Regex substitution callback: turn a matched URL into a Markdown autolink.

	`match` is a match object whose whole match (group 0) is a URL. When
	`expand_tco_links` is enabled and the URL is a t.co short link, the
	link is requested (15 second timeout) and replaced by the URL it
	finally resolves to. If that request fails, the short link is kept
	as-is so a network problem cannot abort the whole archive run.

	Returns the URL wrapped in angle brackets, e.g. `<http://example.com/>`.
	'''
	url = match.group(0)
	final_url = url
	# Only expand if we have a t.co link
	if expand_tco_links and url.startswith(('http://t.co/', 'https://t.co/')):
		try:
			response = urllib2.urlopen(url, None, 15)
		except IOError as error:
			# URLError, HTTPError and socket errors/timeouts are all
			# IOError subclasses in Python 2; keep the short link
			sys.stderr.write('Could not expand %s: %s\n' % (url, error))
		else:
			try:
				final_url = response.geturl()
			finally:
				# Release the connection right away instead of leaking it
				response.close()
	# Make link self-linking for Markdown
	return '<' + final_url.strip() + '>'

# Utility function for sanitizing underscores in usernames
def sanitize_characters(match):
	'''
	Regex substitution callback for a matched @username.

	Returns the matched text with every underscore backslash-escaped when
	`sanitize_usernames` is enabled; otherwise returns the matched text
	untouched.
	'''
	matched_text = match.group(0)
	if not sanitize_usernames:
		return matched_text
	return matched_text.replace('_', r'\_')

# Grab our paths; bail out with a usage hint instead of an IndexError
# traceback when no file was passed on the command line
if len(sys.argv) < 2:
	sys.exit('Usage: archive-tweets.py /path/to/@username.txt')
filepath = sys.argv[1]
# The file name minus its extension is used as the archive file prefix
username, ext = os.path.splitext(os.path.basename(filepath))
root_dir = os.path.dirname(filepath)
# Monthly archives go in a sub-directory next to the source file
archive_dir = os.path.join(root_dir, archive_directory)

# Read our tweets from the file and split them on the separator. The
# `with` block closes the handle even if reading fails, and the handle
# no longer shadows the `file` builtin.
# NOTE(review): the source file is emptied here, before anything has been
# archived; a failure later in the run loses those tweets.
with open(filepath, 'r+') as source_file:
	tweets = re.split(separator_re, source_file.read())
	# Clear out the file
	source_file.truncate(0)

# Parse through our tweets and find their dates.
# Group 1 is the tweet body, group 2 the trailing "[date](status url)"
# Markdown link, group 3 the date text inside that link.
tweet_re = re.compile(r'^(.*?)(\[([^\]]+)\]\([^(]+\))$', re.S)
# Link regex derivative of John Gruber's: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
link_re = re.compile(r'\b(https?://(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.I)
# Usernames that contain an underscore; case-insensitive so handles such
# as @Mixed_Case are matched too
username_re = re.compile(r'@[a-z0-9]*_[a-z0-9_]+', re.I)
# Maps 'YYYY-MM' to the list of formatted tweets for that month. Entries
# that cannot be parsed are kept verbatim under the key 'unparsed', so they
# end up in their own archive file instead of crashing the run.
dated_tweets = {}
for tweet in tweets:
	# Stray whitespace around the first/last entry would keep the anchored
	# tweet regex from matching
	tweet = tweet.strip()
	if not tweet:
		continue
	# Parse our tweet
	matched_tweet = tweet_re.match(tweet)
	date = None
	if matched_tweet is not None:
		try:
			date = dateutil.parser.parse(matched_tweet.group(3)).strftime('%Y-%m')
		except (ValueError, OverflowError):
			# Date text that cannot be interpreted or formatted
			date = None
	if date is None:
		sys.stderr.write('Could not parse tweet, keeping it as-is: %r\n' % tweet)
		dated_tweets.setdefault('unparsed', []).append(tweet)
		continue
	# Escape username underscores (if enabled) in the body only
	sanitized_body = username_re.sub(sanitize_characters, matched_tweet.group(1))
	# Replace t.co links with expanded versions; the date link is appended untouched
	formatted_tweet = link_re.sub(expand_tco, sanitized_body) + matched_tweet.group(2)
	# Toss the tweet into our dated dictionary
	dated_tweets.setdefault(date, []).append(formatted_tweet)

# Now we have our dated tweets; loop through them and write to disk.
# Create the archive folder on first use; open() below would otherwise
# fail after the source file has already been emptied.
if dated_tweets and not os.path.isdir(archive_dir):
	os.makedirs(archive_dir)
for date, month_tweets in dated_tweets.items():
	month_path = os.path.join(archive_dir, username + '-' + date + ext)
	# Construct our string with a trailing separator, just in case of future tweets
	tweet_string = final_separator.join(month_tweets) + final_separator
	# Append our tweets to the archive file; `with` closes it even on error
	with open(month_path, 'a') as archive_file:
		archive_file.write(tweet_string)

# All done!
	#!/usr/bin/python
	# -- coding: utf-8 --

	'''
	This script parses a text file of tweets (generated by [IFTTT][1],
	for instance) and sorts them into files by month. You can run it
	manually from the command line:

	cd /path/to/containing/folder
	./archive-tweets.py /path/to/@username.txt

	Or run it automatically using [Hazel][2] or similar. The script
	expects that you have a file named like your Twitter username with
	tweets formatted and delimited like so:

	My tweet text

	[July 04, 2012 at 06:48AM](http://twitter.com/link/to/status)

	- - -

	And that you want your tweets broken up by month in a subfolder next
	to the original file. You can change the delimiting characters between
	tweets and the name of the final archive file using the config variables
	below.

	By default, this script will also try to resolve t.co shortened links
	into their original URLs. You can disable this by setting the
	`expand_tco_links` config variable below to `False`.

	[1]: http://ifttt.com/
	[2]: http://www.noodlesoft.com/hazel.php
	'''

	# CONFIG: adjust to your liking
	separator_re = r'\s+- - -\s+' # Regex the source file is split on; IFTTT adds extra spaces, so a plain string won't do
	final_separator = '\n\n- - -\n\n' # What gets written between (and after) tweets in your final monthly archives
	archive_directory = 'archive' # The sub-directory (next to the source file) you want your monthly archives in
	expand_tco_links = True # Whether you want t.co links expanded or not (slower!)
	sanitize_usernames = False # Whether you want username underscores backslash escaped

	# Don't edit below here unless you know what you're doing!

	import sys
	import os.path
	import re
	import dateutil.parser
	import urllib2

	# Utility function for expanding t.co links
	def expand_tco(match):
		'''
		Regex substitution callback: turn a matched URL into a Markdown autolink.

		`match` is a match object whose whole match (group 0) is a URL. When
		`expand_tco_links` is enabled and the URL is a t.co short link, the
		link is requested (15 second timeout) and replaced by the URL it
		finally resolves to. If that request fails, the short link is kept
		as-is so a network problem cannot abort the whole archive run.

		Returns the URL wrapped in angle brackets, e.g. `<http://example.com/>`.
		'''
		url = match.group(0)
		final_url = url
		# Only expand if we have a t.co link
		if expand_tco_links and url.startswith(('http://t.co/', 'https://t.co/')):
			try:
				response = urllib2.urlopen(url, None, 15)
			except IOError as error:
				# URLError, HTTPError and socket errors/timeouts are all
				# IOError subclasses in Python 2; keep the short link
				sys.stderr.write('Could not expand %s: %s\n' % (url, error))
			else:
				try:
					final_url = response.geturl()
				finally:
					# Release the connection right away instead of leaking it
					response.close()
		# Make link self-linking for Markdown
		return '<' + final_url.strip() + '>'

	# Utility function for sanitizing underscores in usernames
	def sanitize_characters(match):
		'''
		Regex substitution callback for a matched @username.

		Returns the matched text with every underscore backslash-escaped when
		`sanitize_usernames` is enabled; otherwise returns the matched text
		untouched.
		'''
		matched_text = match.group(0)
		if not sanitize_usernames:
			return matched_text
		return matched_text.replace('_', r'\_')

	# Grab our paths; bail out with a usage hint instead of an IndexError
	# traceback when no file was passed on the command line
	if len(sys.argv) < 2:
		sys.exit('Usage: archive-tweets.py /path/to/@username.txt')
	filepath = sys.argv[1]
	# The file name minus its extension is used as the archive file prefix
	username, ext = os.path.splitext(os.path.basename(filepath))
	root_dir = os.path.dirname(filepath)
	# Monthly archives go in a sub-directory next to the source file
	archive_dir = os.path.join(root_dir, archive_directory)

	# Read our tweets from the file and split them on the separator. The
	# `with` block closes the handle even if reading fails, and the handle
	# no longer shadows the `file` builtin.
	# NOTE(review): the source file is emptied here, before anything has been
	# archived; a failure later in the run loses those tweets.
	with open(filepath, 'r+') as source_file:
		tweets = re.split(separator_re, source_file.read())
		# Clear out the file
		source_file.truncate(0)

	# Parse through our tweets and find their dates.
	# Group 1 is the tweet body, group 2 the trailing "[date](status url)"
	# Markdown link, group 3 the date text inside that link.
	tweet_re = re.compile(r'^(.*?)(\[([^\]]+)\]\([^(]+\))$', re.S)
	# Link regex derivative of John Gruber's: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
	# (alternation bars and `*` quantifiers restored; they had been mangled)
	link_re = re.compile(r'\b(https?://(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.I)
	# Usernames that contain an underscore; case-insensitive so handles such
	# as @Mixed_Case are matched too
	username_re = re.compile(r'@[a-z0-9]*_[a-z0-9_]+', re.I)
	# Maps 'YYYY-MM' to the list of formatted tweets for that month. Entries
	# that cannot be parsed are kept verbatim under the key 'unparsed', so they
	# end up in their own archive file instead of crashing the run.
	dated_tweets = {}
	for tweet in tweets:
		# Stray whitespace around the first/last entry would keep the anchored
		# tweet regex from matching
		tweet = tweet.strip()
		if not tweet:
			continue
		# Parse our tweet
		matched_tweet = tweet_re.match(tweet)
		date = None
		if matched_tweet is not None:
			try:
				date = dateutil.parser.parse(matched_tweet.group(3)).strftime('%Y-%m')
			except (ValueError, OverflowError):
				# Date text that cannot be interpreted or formatted
				date = None
		if date is None:
			sys.stderr.write('Could not parse tweet, keeping it as-is: %r\n' % tweet)
			dated_tweets.setdefault('unparsed', []).append(tweet)
			continue
		# Escape username underscores (if enabled) in the body only
		sanitized_body = username_re.sub(sanitize_characters, matched_tweet.group(1))
		# Replace t.co links with expanded versions; the date link is appended untouched
		formatted_tweet = link_re.sub(expand_tco, sanitized_body) + matched_tweet.group(2)
		# Toss the tweet into our dated dictionary
		dated_tweets.setdefault(date, []).append(formatted_tweet)

	# Now we have our dated tweets; loop through them and write to disk.
	# Create the archive folder on first use; open() below would otherwise
	# fail after the source file has already been emptied.
	if dated_tweets and not os.path.isdir(archive_dir):
		os.makedirs(archive_dir)
	for date, month_tweets in dated_tweets.items():
		month_path = os.path.join(archive_dir, username + '-' + date + ext)
		# Construct our string with a trailing separator, just in case of future tweets
		tweet_string = final_separator.join(month_tweets) + final_separator
		# Append our tweets to the archive file; `with` closes it even on error
		with open(month_path, 'a') as archive_file:
			archive_file.write(tweet_string)

	# All done!