Created
July 5, 2012 12:06
-
-
Save ttscoff/3053353 to your computer and use it in GitHub Desktop.
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
''' | |
This script parses a text file with archived tweets and sorts them into archive files based on month | |
Original script by [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012 | |
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012 | |
Designed to operate via Hazel or other file watcher. Reads a Dropbox file | |
created by <http://ifttt.com> with recent tweets. | |
A certain format is expected by the script for the IFTTT template: | |
{{Text}}<br><br> | |
[{{CreatedAt}}]({{LinkToTweet}})<br><br> | |
---<br><br> | |
It looks at the first argument passed for the file to read. It empties the file out when it's done. | |
This works well with nvALT if you save your notes to Dropbox and then point this script to save | |
there. That allows for very fast indexing and searching of your tweets. | |
Set the folder to save to below. If you don't want the overhead of expanding t.co links, | |
be sure to set expand_tco_links to false. | |
TODO: Expand all url shorteners | |
TODO: Make links <self-linking> | |
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues | |
''' | |
## Config
# CONFIG: adjust to your liking
archive_dir = '/Users/ttscoff/Dropbox/nvALT2.2' # The sub-directory you want your monthly archives in
separator_re = r'\s+---\s+' # IFTTT adds extra spaces, so have to use a regex
final_separator = '\n\n---\n\n' # What you want in your final monthly archives
expand_tco_links = True # Whether you want t.co links expanded or not (makes script run much slower, more error prone)
ext = '.md' # Extension to use on archive filenames
# Don't edit below here unless you know what you're doing
# NOTE: this script targets Python 2 (urllib2); dateutil is a third-party package
import sys
import os.path
import re
import dateutil.parser
import urllib2
# Utility function for expanding t.co links
def expand_tco(match):
    '''Regex callback: resolve a matched t.co short link and wrap it in <angle brackets>.

    Honors the module-level expand_tco_links flag; when expansion is disabled,
    or the lookup fails, the original (unexpanded) link is returned so a single
    bad URL never aborts the archiving run.
    '''
    if not expand_tco_links:
        return '<' + match.group(0).strip() + '>'
    # Use http so urllib2 skips the SSL handshake; t.co redirects either way.
    url = re.sub(r'^https', 'http', match.group(0))
    sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
    try:
        final_url = urllib2.urlopen(url).geturl()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        sys.stderr.write('\033[KError expanding ' + url + '\r')
        return '<' + url.strip() + '>'
    if url == final_url:
        sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
    else:
        sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
    return '<' + final_url.strip() + '>'
# Grab our paths. Filenames are expected to look like "<username>-<type>.<ext>".
filepath = sys.argv[1]
fileparts = re.match(r'^(.*?)-(.*?)\.', os.path.basename(filepath))
if fileparts is None:
    sys.stderr.write("Invalid filename")
    quit()
username, typename = fileparts.group(1), fileparts.group(2)

# Read our tweets from the file, then empty it so IFTTT can append fresh ones.
file = open(filepath, 'r+')
tweets = re.split(separator_re, file.read())
file.truncate(0)
file.close()

# Parse through our tweets and find their dates.
# Captures the bracketed date out of the "[date](link)" line each tweet ends with.
date_re = re.compile(r'^.*?\[([^\]]+)\]\(.+?\)$', re.S)
dated_tweets = {}
for tweet in tweets:
    if len(tweet) > 0:
        # Replace t.co links with expanded versions (dot escaped: literal "t.co" only).
        if expand_tco_links:
            formatted_tweet = re.sub(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&)', expand_tco, tweet)
        else:
            # BUGFIX: this branch assigned `formatted_text`, so the append below
            # raised NameError whenever expand_tco_links was False.
            formatted_tweet = tweet
        # Grab our date, bucket the tweet by its year-month.
        date = date_re.sub(r'\1', tweet)
        date = dateutil.parser.parse(date).strftime('%Y-%m')
        dated_tweets.setdefault(date, []).append(formatted_tweet)

# Now we have our dated tweets; loop through them and append to monthly archives.
for date, month_tweets in dated_tweets.items():
    month_path = os.path.join(archive_dir, '@' + username + '-' + typename + '_' + date + ext)
    # Trailing separator too, so future appends stay delimited.
    tweet_string = final_separator.join(month_tweets) + final_separator
    file = open(month_path, 'a')
    file.write(tweet_string)
    file.close()
    sys.stderr.write('Wrote tweet to ' + month_path + '\n')
# All done!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
''' | |
Converts a ThinkUp CSV export to monthly archive files with Markdown formatting | |
Original script by [Dr. Drang](http://www.leancrew.com/all-this/2012/07/archiving-tweets/) 2012 | |
Pieces borrowed from [Ian Beck](http://beckism.com/2012/07/archiving-tweets/) 2012 | |
This iteration by [Brett Terpstra](http://brettterpstra.com) 2012 | |
Requires three ordered arguments: tu2md.py filename.csv username target_folder | |
`filename.csv` is the [path and] name of the CSV file you exported from ThinkUp | |
`username` is used for naming the files, needed for handling multiple accounts | |
`target_folder` is the directory where the archive.md files will be created. Trailing slash optional. | |
TODO: Expand all url shorteners | |
TODO: Make links <self-linking> | |
TODO: Escape hashtags (at least ones at the beginning of a line) to avoid Markdown formatting issues | |
''' | |
## Config
ext = '.md' # the extension to use for created files
expand_tco_links = True # should we expand t.co links while archiving (slow)?
## End Config
# User-editable content ends here. Proceed with caution.
import csv
import os
from datetime import datetime
import sys
import re
import urllib2  # NOTE: Python 2 only; the print statements below are Python 2 too
# Usage: tu2md.py filename.csv username target_folder
if len(sys.argv) != 4:
    print "Requires three arguments:"
    print "filename username archive_directory"
    quit()
me = sys.argv[2]  # Twitter username, used in archive filenames and status URLs
archive_dir = re.sub(r'/?$', '/', sys.argv[3])  # normalize to exactly one trailing slash
# Utility function for expanding t.co links
def expand_tco(match):
    '''Regex callback: resolve a matched t.co short link and wrap it in <angle brackets>.

    Honors the module-level expand_tco_links flag; when expansion is disabled,
    or the lookup fails, the original (unexpanded) link is returned so a single
    bad URL never aborts the archiving run.
    '''
    if not expand_tco_links:
        return '<' + match.group(0).strip() + '>'
    # Use http so urllib2 skips the SSL handshake; t.co redirects either way.
    url = re.sub(r'^https', 'http', match.group(0))
    sys.stderr.write('\033[KExpanding t.co link: <' + url + '> ... \r')
    try:
        final_url = urllib2.urlopen(url).geturl()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        sys.stderr.write('\033[KError expanding ' + url + '\r')
        return '<' + url.strip() + '>'
    if url == final_url:
        sys.stderr.write('\033[KNo expansion for ' + final_url + '\r')
    else:
        sys.stderr.write('\033[KExpanded ' + url + ' to ' + final_url[0:15] + '\r')
    return '<' + final_url.strip() + '>'
# Open the CSV file specified on the command line and read the field names.
tfile = open(sys.argv[1])
treader = csv.reader(tfile)
fields = next(treader)  # header row; builtin next() works on Python 2.6+ and 3
# Fill a list with the tweets, with each tweet a dictionary keyed by header name.
allInfo = [dict(zip(fields, row)) for row in treader]
tfile.close()  # was leaked before
# Collect only the info we need in a list of lists. Convert the date string
# into a datetime object. Date goes first so a plain sort() orders chronologically.
tweets = [[datetime.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S"), x['post_id'], x['post_text']] for x in allInfo]
tweets.sort()
# t.co short links (dot escaped); the lookahead stops the match before trailing
# punctuation, including curly quotes. Compiled once, reused per tweet.
tco_re = re.compile(r'https?://t\.co/\S+?(?=\s|\.|,|\)|:|;|\'|"|\?|!|>|&|’|”)')
last_date = ''
first_date = ''
output = ''
for x in tweets:
    cur_date = x[0].strftime("%Y-%m")
    if last_date == '':
        first_date = cur_date
        sys.stderr.write('[' + datetime.today().strftime('%c') + '] Starting archive from ' + x[0].strftime("%B, %Y") + '\n')
    elif cur_date != last_date:
        # Month rolled over: flush the buffered month to its own archive file.
        out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
        file = open(out_path, 'w')
        file.write(output)
        file.close()
        output = ''
        # BUGFIX: report the month actually saved (last_date); the old message used
        # cur_date, producing "Archive for 2011-08 saved to ..._2011-07.md".
        sys.stderr.write("\033[K===[ Archive for " + last_date + " saved to " + out_path + ' ]\n')
    last_date = cur_date
    output += tco_re.sub(expand_tco, x[2]) + '\n\n' + '[' + x[0].strftime("%B %d, %Y at %I:%M%p") + '](http://twitter.com/' + me + '/status/' + x[1] + ')\n\n---\n\n'
# Flush the final (possibly only) month; guard so an empty CSV doesn't
# reference the never-assigned first_date or write a bogus file.
if last_date:
    out_path = os.path.join(archive_dir, '@' + me + '-Twitter_' + last_date + ext)
    file = open(out_path, 'w')
    file.write(output)
    file.close()
    sys.stderr.write('\033[K[' + datetime.today().strftime('%c') + '] Done archiving from ' + first_date + ' to ' + last_date + '\n')
# $ ./tu2md.py posts-MarkedApp-twitter.csv MarkedApp archivetest | |
# [Thu Jul 5 06:58:53 2012] Starting archive from July, 2011 | |
# ===[ Archive for 2011-08 saved to archivetest/@MarkedApp-Twitter_2011-07.md ] | |
# ===[ Archive for 2011-09 saved to archivetest/@MarkedApp-Twitter_2011-08.md ] | |
# ===[ Archive for 2011-10 saved to archivetest/@MarkedApp-Twitter_2011-09.md ] | |
# ===[ Archive for 2011-11 saved to archivetest/@MarkedApp-Twitter_2011-10.md ] | |
# [...] | |
# [Thu Jul 5 07:00:02 2012] Done archiving from 2011-07 to 2012-07 | |
# $ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
# Brett Terpstra <http://brettterpstra.com> | |
# The beginnings of a means to quickly search the text archive of tweets I've built | |
# Currently allows you to return tweets, one per line, based on a simple timespan parameter | |
# Not entirely accurate at this point. Seriously. | |
# | |
# `tweetscan.rb d` returns tweets from the current day | |
# `tweetscan.rb m` returns all tweets since the first day of the month | |
# `tweetscan.rb y` returns all tweets this year | |
# `tweetscan.rb` returns all tweets in the archive | |
# | |
# Try `tweetscan.rb d|wc -l` to see how many times you've tweeted today | |
# Use `tweetscan.rb y|wc -w` to see about how many words you've tweeted this year | |
# Then, get depressed about how much time you spend on Twitter | |
# | |
# TODO: Add user-specified date filter | |
# TODO: Add tweet/word/char count options | |
# TODO: Add (username and) fuzzy keyword search | |
# TODO: Add (username and) keyword filter | |
# TODO: Option to output date with tweet | |
# | |
## Config
archive = '~/Dropbox/nvALT2.2'
username = 'ttscoff'
tweettype = 'twitter'
## End Config
# Don't edit beyond this point unless you want to help out and make my life easier. Seriously, I'm short on time these days.
search_span = ARGV[0]
search_date = ""
unless search_span.nil?
  # Map the span argument to a filename prefix (file_date) and a regex fragment
  # (search_date) that matches the bracketed date line inside each archived tweet.
  # BUGFIX: `when ... :` is Ruby 1.8-only syntax and a SyntaxError on 1.9+; use `then`.
  file_date, search_date = case search_span
    when /[td]((od)?ay)?/i then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B %d, %Y')]
    when /m(onth)?/i       then [Time.now.strftime('%Y-%m'), Time.now.strftime('%B \d\d, %Y')]
    when /y(ear)?/i        then [Time.now.strftime('%Y'),    Time.now.strftime('.*?%Y')]
  end
end
Dir.chdir(File.expand_path(archive))
# Scan every matching monthly archive; tweets are "---"-separated blocks whose
# trailing "[date](link)" line is stripped before printing one tweet per line.
Dir["@#{username}-#{tweettype.capitalize}_#{file_date}*.md"].each do |file|
  # File.read instead of File.open(...).read: doesn't leak the file handle.
  File.read(file).split(/---/).each {|tweet|
    puts tweet.gsub(/\[#{search_date}.*?\)/i,'').strip.gsub(/\n/,' ') if tweet =~ /\[#{search_date}/
  }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can't remember if images work in GitHub comments, but this is tweetscan.rb combined with ack to remind me what the trick was to get Droplr direct links. I knew I tweeted it a while ago when I found out, but didn't write it down anywhere else.
Since GitHub seems to hide image links somehow…
http://assets.brettterpstra.com/Screenshot%202012-07-05%20at%2011.46.png
See, I told you this would be useful.