Skip to content

Instantly share code, notes, and snippets.

@lukeredpath
Created July 11, 2012 11:53
Show Gist options
  • Save lukeredpath/3089893 to your computer and use it in GitHub Desktop.
Save lukeredpath/3089893 to your computer and use it in GitHub Desktop.
A parser for Twitter's tweet archive dump format

Background

If you live in the EU, you should be able to obtain your entire Tweet history from Twitter under EU privacy laws.

In the UK these laws are backed by the Data Protection Act. To obtain your data from Twitter, you need to send them an email with a Subject Access Request. Once you have your support ticket, you need to fax them some ID. You will then receive an archive of all the data Twitter have about you, including all of your tweets.

This script/class can be used to parse the resulting file so you can stick the data into something a bit more permanent, a database or document store perhaps.

For more details on how to get your data from Twitter, read this article: https://www.privacyinternational.org/blog/what-does-twitter-know-about-its-users-nologs

gem "mongo"
gem "bson_ext"
gem "mongoid"
GEM
specs:
activemodel (3.2.2)
activesupport (= 3.2.2)
builder (~> 3.0.0)
activesupport (3.2.2)
i18n (~> 0.6)
multi_json (~> 1.0)
bson (1.5.2)
bson_ext (1.5.2)
bson (= 1.5.2)
builder (3.0.0)
i18n (0.6.0)
mongo (1.5.2)
bson (= 1.5.2)
mongoid (2.4.2)
activemodel (~> 3.1)
mongo (~> 1.3)
tzinfo (~> 0.3.22)
multi_json (1.3.6)
tzinfo (0.3.33)
PLATFORMS
ruby
DEPENDENCIES
bson_ext
mongo
mongoid
#!/usr/bin/env ruby
#
# If you request your Tweet history from Twitter using a Subject Access Request,
# you'll receive a file called yourusername-tweets.txt.
#
# At first glance, it looks like delimited YAML records, however the data is not
# quite valid enough to be easily parsed by Ruby's standard YAML parser.
#
# The main issue is that each field in a single record is not quoted, and it may
# also be spread over multiple lines (generally the 'text' field) and contain
# special characters.
#
# Here is an example of the format:
#
# ********************
# user_id: 72573
# created_at: Thu Jun 21 09:43:33 +0000 2007
# created_via: web
# status_id: 114323742
# text: testing testing...
#
# The file also contains a PGP signature.
#
# This gist contains a class, StreamingTweetParser, which parses the file
# line by line and produces an array of Hashes, one per record. Some specs are
# included at the bottom of the file.
#
# If you find a bug, add a new spec, fix it and send me a pull request or patch.
#
# To use this script directly, just run `import.rb <input-file>`. It will parse each
# tweet and attempt to import it into a MongoDB database.
$:.unshift(File.dirname(__FILE__))
require 'streaming_tweet_parser'
require 'mongodb_tweet_importer'
# Command-line entry point: parse the tweet archive named by the first
# argument and insert every parsed tweet into MongoDB.
input_file = ARGV[0]

if input_file.nil?
  # Print the name the script was actually invoked as, so the usage line
  # stays correct whatever the file is called.
  puts "Usage: #{File.basename($0)} [input-file]"
  exit(1)
end

parser = StreamingTweetParser.new

puts "Parsing records..."
# Stream the archive one line at a time instead of loading the whole file
# into memory first. There is no explicit parser.finish call here: the
# parser finishes by itself when it reaches the PGP signature line.
File.foreach(input_file) do |line|
  parser << line
end
puts "Parsed #{parser.tweets.length} tweets from #{File.basename(input_file)}."

importer = MongodbTweetImporter.new

puts "Importing into MongoDB..."
parser.tweets.each do |tweet|
  importer.import(tweet)
end
puts "Done."
require 'mongoid'
Mongoid.database = Mongo::Connection.new.db("tweet_archive")
module Mongo
  # Mongoid document for a single archived tweet, stored in the "tweets"
  # collection.
  class Tweet
    include Mongoid::Document

    store_in "tweets"

    field :user_id, type: Integer
    field :created_via, type: String
    field :status_id, type: Integer
    field :text, type: String

    index :status_id, unique: true

    # created_at is not declared as a field above; the raw stored attribute
    # is read and parsed on demand.
    #
    # Returns a Time, or nil when the document has no created_at attribute
    # (previously this raised because nil was handed to Time.parse).
    # NOTE(review): assumes the stored value is a String such as
    # "Thu Jun 21 09:43:33 +0000 2007" -- confirm against the imported data.
    def created_at
      raw = read_attribute("created_at")
      raw && Time.parse(raw)
    end

    # Returns the status_id of the last document when ordered by status_id,
    # or nil when the query yields no document (previously this raised
    # NoMethodError on nil).
    def self.last_tweet_id
      latest = order_by("status_id").last
      latest && latest.status_id
    end
  end
end
require 'mongo'
# Inserts parsed tweet records into a MongoDB collection.
class MongodbTweetImporter
  # mongo      - connection-like object responding to #db(name); defaults to
  #              a new Mongo::Connection.
  # database   - name of the database to write to (default "tweet_archive").
  # collection - name of the collection to insert into (default "tweets").
  def initialize(mongo = Mongo::Connection.new, database = "tweet_archive", collection = "tweets")
    @db = mongo.db(database)
    @collection = collection
  end

  # Inserts one record (the import script passes the Hashes produced by the
  # parser) and returns whatever the collection's #insert returns.
  def import(record)
    @db[@collection].insert(record)
  end
end
# Incrementally parses Twitter's tweet archive dump, one line at a time.
#
# Feed every line of the archive to #<<. Each line made up solely of
# asterisks starts a new record; "key: value" lines for the known keys become
# fields; any other non-blank line continues the previous field. Completed
# records are collected in #tweets as Hashes of String => String.
class StreamingTweetParser
  # Array of Hashes, one per completed record, in input order.
  attr_reader :tweets

  def initialize
    @tweets = []
    @current_record = nil
  end

  # Consumes one line of input.
  #
  # Lines seen before the first separator, or after the record has been
  # finished (e.g. the body of the PGP signature), are ignored.
  def <<(line)
    if line =~ /\A\*+\s*\z/
      # Only a line consisting entirely of asterisks is a separator, so tweet
      # text that merely starts with "*" stays part of the tweet.
      start_new_record
    elsif line =~ /-----BEGIN PGP SIGNATURE-----/
      finish
    elsif @current_record
      @current_record << line.strip
    end
  end

  # Flushes the record currently being built (if any) into #tweets.
  #
  # Safe to call more than once: the record is cleared after flushing, so a
  # repeated call neither duplicates the last tweet nor lets later lines leak
  # into it. Returns the tweets array.
  def finish
    flush_current_record
    @tweets
  end

  private

  # Closes the record in progress and begins an empty one.
  def start_new_record
    flush_current_record
    @current_record = Record.new
  end

  # Moves the record in progress, if there is one, into the tweets array.
  def flush_current_record
    return unless @current_record
    @tweets << @current_record.to_hash
    @current_record = nil
  end

  # Accumulates the fields of a single tweet record.
  class Record
    VALID_KEYS = [
      'user_id',
      'created_at',
      'status_id',
      'created_via',
      'text'
    ]

    # A field line is "<valid key>: <value>" with the key at the very start
    # of the line. Anchoring prevents continuation text such as
    # "some context: ..." from being mistaken for a new field.
    FIELD_PATTERN = /\A(#{Regexp.union(VALID_KEYS).source}):\s(.*)\z/m

    def initialize
      @fields = []
    end

    # Adds one line to the record: blank lines are skipped, field lines start
    # a new field, anything else is appended to the previous field's value.
    def <<(line)
      return if line.strip.empty?

      match = FIELD_PATTERN.match(line)
      if match
        @fields << [match[1], match[2]]
      elsif @fields.last
        @fields.last[1] << "\n" << line # multi-line continuation of last field
      end
      # A continuation line with no preceding field has nothing to attach to
      # and is dropped rather than raising on nil.
    end

    # Returns the fields as a Hash of key => value. Values are copied so that
    # later appends to this record cannot alter a Hash already handed out.
    # When a key occurs more than once the last occurrence wins.
    def to_hash
      @fields.each_with_object({}) do |(key, value), hash|
        hash[key] = value.dup
      end
    end
  end
end
# Inline specs: they only run when this file is executed directly.
if $0 == __FILE__
  require 'rspec/autorun'

  describe StreamingTweetParser do
    # A fresh parser for every example.
    let(:parser) { StreamingTweetParser.new }

    # Pushes the given lines into the parser, in order.
    def feed(*lines)
      lines.each { |line| parser << line }
    end

    it 'can parse a single record with no special characters' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: tweet 1",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0].to_hash.should have(5).keys
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == 'tweet 1'
    end

    it 'ignores data before the first record' do
      feed(
        "some data",
        "before",
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: tweet 1",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == 'tweet 1'
    end

    it 'can parse multiple records with no special characters' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: tweet 1",
        "\n",
        "********************",
        "user_id: 72574",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: tweet 2",
        "\n"
      )
      parser.finish

      parser.should have(2).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == 'tweet 1'
      parser.tweets[1]['user_id'].should == '72574'
      parser.tweets[1]['text'].should == 'tweet 2'
    end

    it 'can parse a single record with an @symbol' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: @reply tweet 1",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == '@reply tweet 1'
    end

    it 'can parse a single record with an quotes' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: @reply \"tweet\" 1",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == '@reply "tweet" 1'
    end

    it 'can parse a single record with text split over multiple lines' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: @reply tweet 1",
        "\n",
        "containing some new lines",
        "\n",
        "another line",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == "@reply tweet 1\ncontaining some new lines\nanother line"
    end

    it 'can parse a single record with text split over multiple lines with colons' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: @reply tweet 1",
        "oh yes: look",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == "@reply tweet 1\noh yes: look"
    end

    it 'can parse a single record with colons in the value' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: this is cool: http://google.com",
        "\n"
      )
      parser.finish

      parser.should have(1).tweets
      parser.tweets[0]['user_id'].should == '72573'
      parser.tweets[0]['text'].should == 'this is cool: http://google.com'
    end

    # No explicit finish here: the signature line alone must flush the record.
    it 'automatically finishes if it sees a signature' do
      feed(
        "********************",
        "user_id: 72573",
        "created_at: Fri Nov 02 15:31:13 +0000 2007",
        "created_via: twitterrific",
        "status_id: 383462562",
        "text: tweet 1",
        "\n",
        "-----BEGIN PGP SIGNATURE-----"
      )

      parser.should have(1).tweets
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment