@ttscoff — Created July 5, 2012
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
This script parses a text file of archived tweets and sorts them into monthly archive files.
Original script by [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012
Designed to operate via Hazel or another file watcher. Reads a Dropbox file
created by <http://ifttt.com> with recent tweets.
The script expects the IFTTT template to use this format:
{{Text}}<br><br>
[{{CreatedAt}}]({{LinkToTweet}})<br><br>
---<br><br>
The file to read is taken from the first argument passed to the script, and the file is
emptied when processing is done. This works well with nvALT if you save your notes to
Dropbox and point this script to save there; that allows very fast indexing and
searching of your tweets.
Set the folder to save to below. If you don't want the overhead of expanding t.co links,
be sure to set expand_tco_links to False.
TODO: Expand all url shorteners
TODO: Make links <self-linking>
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues
'''
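# For reference, a single entry in the IFTTT-generated Dropbox file should look
# roughly like this (illustrative values; actual whitespace varies, since IFTTT
# adds extra spaces — hence the separator regex below):
#
#   Just found a neat trick http://t.co/abc123
#
#   [July 05, 2012 at 12:06PM](https://twitter.com/ttscoff/status/12345)
#
#   ---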
## Config
# CONFIG: adjust to your liking
archive_dir = '/Users/ttscoff/Dropbox/nvALT2.2' # The sub-directory you want your monthly archives in
separator_re = r'\s+---\s+' # IFTTT adds extra spaces, so have to use a regex
final_separator = '\n\n---\n\n' # What you want in your final monthly archives
expand_tco_links = True # Whether you want t.co links expanded or not (makes script run much slower, more error prone)
ext = '.md' # Extension to use on archive filenames
# Don't edit below here unless you know what you're doing
import sys
import os.path
import re
import dateutil.parser
import urllib2
# Utility function for expanding t.co links
def expand_tco(match):
    if expand_tco_links:
        url = re.sub(r'^https', 'http', match.group(0))
        sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
        try:
            final_url = urllib2.urlopen(url).geturl()
            if url == final_url:
                sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
            else:
                sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
            return '<' + final_url.strip() + '>'
        except:
            sys.stderr.write('\033[KError expanding ' + url + '\r')
            return '<' + url.strip() + '>'
    else:
        return '<' + match.group(0).strip() + '>'
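# For example (illustrative URLs): a match on 'https://t.co/abc123' is fetched
# as 'http://t.co/abc123' and returns something like '<http://example.com/post>'
# on success, or '<http://t.co/abc123>' if the lookup fails or expansion is
# disabled.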
# Grab our paths
filepath = sys.argv[1]
fileparts = re.match(r'^(.*?)-(.*?)\.',os.path.basename(filepath))
if fileparts is None:
    sys.stderr.write("Invalid filename\n")
    quit()
username, typename = fileparts.group(1), fileparts.group(2)
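# For example, a file named 'ttscoff-twitter.txt' (hypothetical; any
# 'username-typename.extension' name works) yields username 'ttscoff' and
# typename 'twitter', which are reused below when naming the archive files.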
# root_dir = os.path.dirname(filepath)
# archive_dir = os.path.join(root_dir, archive_directory)
# Read our tweets from the file
file = open(filepath, 'r+')
tweets = file.read()
tweets = re.split(separator_re, tweets)
## Debugging
# import pprint
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(tweets)
# file.close()
# quit()
# Clear out the file
file.truncate(0)
file.close()
# Parse through our tweets and find their dates
date_re = re.compile(r'^.*?\[([^\]]+)\]\(.+?\)$', re.S)
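# date_re isolates the bracketed timestamp in the trailing
# '[July 05, 2012 at 12:06PM](https://twitter.com/...)' line that the IFTTT
# template appends to each tweet (example values illustrative); re.S lets the
# leading .*? span the tweet text across newlines.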
dated_tweets = {}
for tweet in tweets:
    if len(tweet) > 0:
        # Replace t.co links with expanded versions
        if expand_tco_links:
            formatted_tweet = re.sub(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&)', expand_tco, tweet)
        else:
            formatted_tweet = tweet
        # Grab our date, and toss the tweet into our dated dictionary
        date = date_re.sub(r'\1', tweet)
        date = dateutil.parser.parse(date).strftime('%Y-%m')
        if date not in dated_tweets:
            dated_tweets[date] = []
        dated_tweets[date].append(formatted_tweet)
# Now we have our dated tweets; loop through them and write to disk
for date, tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, '@' + username + '-' + typename + '_' + date + ext)
    # Construct our string with a trailing separator, just in case of future tweets
    tweet_string = final_separator.join(tweets) + final_separator
    # Append our tweets to the archive file
    file = open(month_path, 'a')
    file.write(tweet_string)
    sys.stderr.write('Wrote tweets to ' + month_path + '\n')
    file.close()
# All done!
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting
Original script by [Dr. Drang](http://www.leancrew.com/all-this/2012/07/archiving-tweets/) 2012
Pieces borrowed from [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012
Requires three ordered arguments (see the sample run at the end of this script):
    tu2md.py filename.csv username target_folder
`filename.csv` is the [path and] name of the CSV file you exported from ThinkUp
`username` is used for naming the files, which is needed when handling multiple accounts
`target_folder` is the directory where the archive .md files will be created. Trailing slash optional.
TODO: Expand all url shorteners
TODO: Make links <self-linking>
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues
'''
## Config
ext = '.md' # the extension to use for created files
expand_tco_links = True # should we expand t.co links while archiving (slow)?
## End Config
# User-editable content ends here. Proceed with caution.
import csv
import os
from datetime import datetime
import sys
import re
import urllib2
if len(sys.argv) != 4:
    print "Requires three arguments:"
    print "filename username archive_directory"
    quit()
me = sys.argv[2]
archive_dir = re.sub(r'/?$', '/', sys.argv[3])
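# For example, re.sub(r'/?$', '/', 'archivetest') yields 'archivetest/', so the
# target folder can be given with or without a trailing slash.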
# Utility function for expanding t.co links
def expand_tco(match):
    if expand_tco_links:
        url = re.sub(r'^https', 'http', match.group(0))
        sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
        try:
            final_url = urllib2.urlopen(url).geturl()
            if url == final_url:
                sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
            else:
                sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
            return '<' + final_url.strip() + '>'
        except:
            sys.stderr.write('\033[KError expanding ' + url + '\r')
            return '<' + url.strip() + '>'
    else:
        return '<' + match.group(0).strip() + '>'
# Open the CSV file specified on the command line and read the field names.
tfile = open(sys.argv[1])
treader = csv.reader(tfile)
fields = treader.next()
# Fill a list with the tweets, with each tweet a dictionary.
allInfo = []
for row in treader:
    allInfo.append(dict(zip(fields, row)))
# Collect only the info we need in a list of lists. Convert the date string
# into a datetime object.
tweets = [ [datetime.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S"), x['post_id'], x['post_text']] for x in allInfo ]
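# Each entry now looks like [datetime(2011, 7, 5, 6, 58, 53), '12345', 'tweet text']
# (illustrative values), so sorting the list below sorts tweets chronologically.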
# We put the date first so we can sort by date easily.
tweets.sort()
last_date = ''
output = ''
for x in tweets:
    cur_date = x[0].strftime("%Y-%m")
    if last_date == '':
        first_date = cur_date
        sys.stderr.write('[' + datetime.today().strftime('%c') + '] Starting archive from ' + x[0].strftime("%B, %Y") + '\n')
    elif cur_date != last_date:
        out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
        file = open(out_path, 'w')
        file.write(output)
        file.close()
        output = ''
        sys.stderr.write("\033[K===[ Archive for " + last_date + " saved to " + out_path + ' ]\n')
    last_date = cur_date
    output += re.sub(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&|’|”)', expand_tco, x[2]) + '\n\n' + '[' + x[0].strftime("%B %d, %Y at %I:%M%p") + '](http://twitter.com/' + me + '/status/' + x[1] + ')\n\n---\n\n'
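# Each archived entry is written out roughly like this (illustrative values):
#
#   Tweet text with <http://example.com/expanded-link>
#
#   [July 05, 2012 at 06:58AM](http://twitter.com/MarkedApp/status/12345)
#
#   ---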
out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
file = open(out_path, 'w')
file.write(output)
file.close()
sys.stderr.write('\033[K[' + datetime.today().strftime('%c') + '] Done archiving from ' + first_date + ' to ' + last_date + '\n')
# $ ./tu2md.py posts-MarkedApp-twitter.csv MarkedApp archivetest
# [Thu Jul 5 06:58:53 2012] Starting archive from July, 2011
# ===[ Archive for 2011-07 saved to archivetest/@MarkedApp-Twitter_2011-07.md ]
# ===[ Archive for 2011-08 saved to archivetest/@MarkedApp-Twitter_2011-08.md ]
# ===[ Archive for 2011-09 saved to archivetest/@MarkedApp-Twitter_2011-09.md ]
# ===[ Archive for 2011-10 saved to archivetest/@MarkedApp-Twitter_2011-10.md ]
# [...]
# [Thu Jul 5 07:00:02 2012] Done archiving from 2011-07 to 2012-07
# $
#!/usr/bin/ruby
# Brett Terpstra <http://brettterpstra.com>
# The beginnings of a means to quickly search the text archive of tweets I've built
# Currently allows you to return tweets, one per line, based on a simple timespan parameter
# Not entirely accurate at this point. Seriously.
#
# `tweetscan.rb d` returns tweets from the current day
# `tweetscan.rb m` returns all tweets since the first day of the month
# `tweetscan.rb y` returns all tweets this year
# `tweetscan.rb` returns all tweets in the archive
#
# Try `tweetscan.rb d|wc -l` to see how many times you've tweeted today
# Use `tweetscan.rb y|wc -w` to see about how many words you've tweeted this year
# Then, get depressed about how much time you spend on Twitter
#
# TODO: Add user-specified date filter
# TODO: Add tweet/word/char count options
# TODO: Add (username and) fuzzy keyword search
# TODO: Add (username and) keyword filter
# TODO: Option to output date with tweet
#
## Config
archive = '~/Dropbox/nvALT2.2'
username = 'ttscoff'
tweettype = 'twitter'
## End Config
# Don't edit beyond this point unless you want to help out and make my life easier. Seriously, I'm short on time these days.
search_span = ARGV[0]
file_date = ""
search_date = ""
unless search_span.nil?
  file_date, search_date = case search_span
  when /[td]((od)?ay)?/i then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B %d, %Y')]
  when /m(onth)?/i then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B \d\d, %Y')]
  when /y(ear)?/i then [Time.now.strftime('%Y'), Time.now.strftime('.*?%Y')]
  end
end
Dir.chdir(File.expand_path(archive))
Dir["@#{username}-#{tweettype.capitalize}_#{file_date}*.md"].each do |file|
  File.open(file).read.split(/---/).each { |tweet|
    puts tweet.gsub(/\[#{search_date}.*?\)/i, '').strip.gsub(/\n/, ' ') if tweet =~ /\[#{search_date}/
  }
end
ttscoff commented Jul 5, 2012:
Can't remember if images work in GitHub comments, but this is tweetscan.rb combined with ack to remind me what the trick was to get Droplr direct links. I knew I tweeted it a while ago when I found out, but didn't write it down anywhere else.

[Image: Ack Barf]

Since GitHub seems to hide image links somehow…

http://assets.brettterpstra.com/Screenshot%202012-07-05%20at%2011.46.png

See, I told you this would be useful.
