@ttscoff — Created July 5, 2012
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
This script parses a text file of archived tweets and sorts them into monthly archive files.
Original script by [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012
Designed to operate via Hazel or another file watcher. Reads a Dropbox file
created by <http://ifttt.com> with recent tweets.
The script expects the IFTTT template to use this format:
{{Text}}<br><br>
[{{CreatedAt}}]({{LinkToTweet}})<br><br>
---<br><br>
The file to read is taken from the first argument passed to the script, and the file is
emptied when processing is done. This works well with nvALT if you save your notes to
Dropbox and point this script to save there; that allows very fast indexing and
searching of your tweets.
Set the folder to save to below. If you don't want the overhead of expanding t.co links,
be sure to set expand_tco_links to False.
TODO: Expand all url shorteners
TODO: Make links <self-linking>
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues
'''
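# For reference, a single entry in the IFTTT-generated Dropbox file should look
# roughly like this (illustrative values; actual whitespace varies, since IFTTT
# adds extra spaces — hence the separator regex below):
#
#   Just found a neat trick http://t.co/abc123
#
#   [July 05, 2012 at 12:06PM](https://twitter.com/ttscoff/status/12345)
#
#   ---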
## Config
# CONFIG: adjust to your liking
archive_dir = '/Users/ttscoff/Dropbox/nvALT2.2' # The sub-directory you want your monthly archives in
separator_re = r'\s+---\s+' # IFTTT adds extra spaces, so have to use a regex
final_separator = '\n\n---\n\n' # What you want in your final monthly archives
expand_tco_links = True # Whether you want t.co links expanded or not (makes script run much slower, more error prone)
ext = '.md' # Extension to use on archive filenames
# Don't edit below here unless you know what you're doing
import sys
import os.path
import re
import dateutil.parser
import urllib2
# Utility function for expanding t.co links
def expand_tco(match):
    if expand_tco_links:
        url = re.sub(r'^https', 'http', match.group(0))
        sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
        try:
            final_url = urllib2.urlopen(url).geturl()
            if url == final_url:
                sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
            else:
                sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
            return '<' + final_url.strip() + '>'
        except:
            sys.stderr.write('\033[KError expanding ' + url + '\r')
            return '<' + url.strip() + '>'
    else:
        return '<' + match.group(0).strip() + '>'
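# For example (illustrative URLs): a match on 'https://t.co/abc123' is fetched
# as 'http://t.co/abc123' and returns something like '<http://example.com/post>'
# on success, or '<http://t.co/abc123>' if the lookup fails or expansion is
# disabled.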
# Grab our paths
filepath = sys.argv[1]
fileparts = re.match(r'^(.*?)-(.*?)\.',os.path.basename(filepath))
if fileparts is None:
    sys.stderr.write("Invalid filename\n")
    quit()
username, typename = fileparts.group(1), fileparts.group(2)
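# For example, a file named 'ttscoff-twitter.txt' (hypothetical; any
# 'username-typename.extension' name works) yields username 'ttscoff' and
# typename 'twitter', which are reused below when naming the archive files.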
# root_dir = os.path.dirname(filepath)
# archive_dir = os.path.join(root_dir, archive_directory)
# Read our tweets from the file
file = open(filepath, 'r+')
tweets = file.read()
tweets = re.split(separator_re, tweets)
## Debugging
# import pprint
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(tweets)
# file.close()
# quit()
# Clear out the file
file.truncate(0)
file.close()
# Parse through our tweets and find their dates
date_re = re.compile(r'^.*?\[([^\]]+)\]\(.+?\)$', re.S)
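# date_re isolates the bracketed timestamp in the trailing
# '[July 05, 2012 at 12:06PM](https://twitter.com/...)' line that the IFTTT
# template appends to each tweet (example values illustrative); re.S lets the
# leading .*? span the tweet text across newlines.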
dated_tweets = {}
for tweet in tweets:
    if len(tweet) > 0:
        # Replace t.co links with expanded versions
        if expand_tco_links:
            formatted_tweet = re.sub(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&)', expand_tco, tweet)
        else:
            formatted_tweet = tweet
        # Grab our date, and toss the tweet into our dated dictionary
        date = date_re.sub(r'\1', tweet)
        date = dateutil.parser.parse(date).strftime('%Y-%m')
        if date not in dated_tweets:
            dated_tweets[date] = []
        dated_tweets[date].append(formatted_tweet)
# Now we have our dated tweets; loop through them and write to disk
for date, tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, '@' + username + '-' + typename + '_' + date + ext)
    # Construct our string with a trailing separator, just in case of future tweets
    tweet_string = final_separator.join(tweets) + final_separator
    # Append our tweets to the archive file
    file = open(month_path, 'a')
    file.write(tweet_string)
    sys.stderr.write('Wrote tweets to ' + month_path + '\n')
    file.close()
# All done!
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting
Original script by [Dr. Drang](http://www.leancrew.com/all-this/2012/07/archiving-tweets/) 2012
Pieces borrowed from [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012
Requires three ordered arguments (see the sample run at the end of this script):
    tu2md.py filename.csv username target_folder
`filename.csv` is the [path and] name of the CSV file you exported from ThinkUp
`username` is used for naming the files, which is needed when handling multiple accounts
`target_folder` is the directory where the archive .md files will be created. Trailing slash optional.
TODO: Expand all url shorteners
TODO: Make links <self-linking>
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues
'''
## Config
ext = '.md' # the extension to use for created files
expand_tco_links = True # should we expand t.co links while archiving (slow)?
## End Config
# User-editable content ends here. Proceed with caution.
import csv
import os
from datetime import datetime
import sys
import re
import urllib2
if len(sys.argv) != 4:
    print "Requires three arguments:"
    print "filename username archive_directory"
    quit()
me = sys.argv[2]
archive_dir = re.sub(r'/?$', '/', sys.argv[3])
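# For example, re.sub(r'/?$', '/', 'archivetest') yields 'archivetest/', so the
# target folder can be given with or without a trailing slash.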
# Utility function for expanding t.co links
def expand_tco(match):
    if expand_tco_links:
        url = re.sub(r'^https', 'http', match.group(0))
        sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
        try:
            final_url = urllib2.urlopen(url).geturl()
            if url == final_url:
                sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
            else:
                sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
            return '<' + final_url.strip() + '>'
        except:
            sys.stderr.write('\033[KError expanding ' + url + '\r')
            return '<' + url.strip() + '>'
    else:
        return '<' + match.group(0).strip() + '>'
# Open the CSV file specified on the command line and read the field names.
tfile = open(sys.argv[1])
treader = csv.reader(tfile)
fields = treader.next()
# Fill a list with the tweets, with each tweet a dictionary.
allInfo = []
for row in treader:
    allInfo.append(dict(zip(fields, row)))
# Collect only the info we need in a list of lists. Convert the date string
# into a datetime object.
tweets = [ [datetime.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S"), x['post_id'], x['post_text']] for x in allInfo ]
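# Each entry now looks like [datetime(2011, 7, 5, 6, 58, 53), '12345', 'tweet text']
# (illustrative values), so sorting the list below sorts tweets chronologically.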
# We put the date first so we can sort by date easily.
tweets.sort()
last_date = ''
output = ''
for x in tweets:
    cur_date = x[0].strftime("%Y-%m")
    if last_date == '':
        first_date = cur_date
        sys.stderr.write('[' + datetime.today().strftime('%c') + '] Starting archive from ' + x[0].strftime("%B, %Y") + '\n')
    elif cur_date != last_date:
        out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
        file = open(out_path, 'w')
        file.write(output)
        file.close()
        output = ''
        sys.stderr.write("\033[K===[ Archive for " + last_date + " saved to " + out_path + ' ]\n')
    last_date = cur_date
    output += re.sub(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&|’|”)', expand_tco, x[2]) + '\n\n' + '[' + x[0].strftime("%B %d, %Y at %I:%M%p") + '](http://twitter.com/' + me + '/status/' + x[1] + ')\n\n---\n\n'
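# Each archived entry is written out roughly like this (illustrative values):
#
#   Tweet text with <http://example.com/expanded-link>
#
#   [July 05, 2012 at 06:58AM](http://twitter.com/MarkedApp/status/12345)
#
#   ---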
out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
file = open(out_path, 'w')
file.write(output)
file.close()
sys.stderr.write('\033[K[' + datetime.today().strftime('%c') + '] Done archiving from ' + first_date + ' to ' + last_date + '\n')
# $ ./tu2md.py posts-MarkedApp-twitter.csv MarkedApp archivetest
# [Thu Jul 5 06:58:53 2012] Starting archive from July, 2011
# ===[ Archive for 2011-07 saved to archivetest/@MarkedApp-Twitter_2011-07.md ]
# ===[ Archive for 2011-08 saved to archivetest/@MarkedApp-Twitter_2011-08.md ]
# ===[ Archive for 2011-09 saved to archivetest/@MarkedApp-Twitter_2011-09.md ]
# ===[ Archive for 2011-10 saved to archivetest/@MarkedApp-Twitter_2011-10.md ]
# [...]
# [Thu Jul 5 07:00:02 2012] Done archiving from 2011-07 to 2012-07
# $
#!/usr/bin/ruby
# Brett Terpstra <http://brettterpstra.com>
# The beginnings of a means to quickly search the text archive of tweets I've built
# Currently allows you to return tweets, one per line, based on a simple timespan parameter
# Not entirely accurate at this point. Seriously.
#
# `tweetscan.rb d` returns tweets from the current day
# `tweetscan.rb m` returns all tweets since the first day of the month
# `tweetscan.rb y` returns all tweets this year
# `tweetscan.rb` returns all tweets in the archive
#
# Try `tweetscan.rb d|wc -l` to see how many times you've tweeted today
# Use `tweetscan.rb y|wc -w` to see about how many words you've tweeted this year
# Then, get depressed about how much time you spend on Twitter
#
# TODO: Add user-specified date filter
# TODO: Add tweet/word/char count options
# TODO: Add (username and) fuzzy keyword search
# TODO: Add (username and) keyword filter
# TODO: Option to output date with tweet
#
## Config
archive = '~/Dropbox/nvALT2.2'
username = 'ttscoff'
tweettype = 'twitter'
## End Config
# Don't edit beyond this point unless you want to help out and make my life easier. Seriously, I'm short on time these days.
search_span = ARGV[0]
file_date = ""
search_date = ""
unless search_span.nil?
  file_date, search_date = case search_span
  when /[td]((od)?ay)?/i then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B %d, %Y')]
  when /m(onth)?/i then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B \d\d, %Y')]
  when /y(ear)?/i then [Time.now.strftime('%Y'), Time.now.strftime('.*?%Y')]
  end
end
Dir.chdir(File.expand_path(archive))
Dir["@#{username}-#{tweettype.capitalize}_#{file_date}*.md"].each do |file|
  File.open(file).read.split(/---/).each { |tweet|
    puts tweet.gsub(/\[#{search_date}.*?\)/i, '').strip.gsub(/\n/, ' ') if tweet =~ /\[#{search_date}/
  }
end
ttscoff commented Jul 5, 2012:
Can't remember if images work in GitHub comments, but this is tweetscan.rb combined with ack to remind me what the trick was to get Droplr direct links. I knew I tweeted it a while ago when I found out, but didn't write it down anywhere else.

[Image: Ack Barf]

Since GitHub seems to hide image links somehow…

http://assets.brettterpstra.com/Screenshot%202012-07-05%20at%2011.46.png

See, I told you this would be useful.
