@lukeredpath
Created July 11, 2012 11:53
A parser for Twitter's tweet archive dump format

Background

If you live in the EU, you should be able to obtain your entire Tweet history from Twitter under EU privacy laws.

In the UK these laws are backed by the Data Protection Act. To obtain your data from Twitter, you need to send them an email with a Subject Access Request. Once you have your support ticket, you need to fax them some ID. You will then receive an archive of all the data Twitter have about you, including all of your tweets.

This script/class can be used to parse the resulting file so you can put the data into something a bit more permanent, such as a database or document store; a usage sketch follows below.

For more details on how to get your data from Twitter, read this article: https://www.privacyinternational.org/blog/what-does-twitter-know-about-its-users-nologs
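To give a feel for how the parser is used (an illustrative sketch mirroring what the import script below does, not a separate file in this gist): feed it the dump line by line and it collects an array of plain Ruby hashes. The PGP signature at the end of the dump marks the final record automatically, so no explicit finish call is needed.

require 'streaming_tweet_parser'

parser = StreamingTweetParser.new
File.readlines("yourusername-tweets.txt").each { |line| parser << line }

parser.tweets.each do |tweet|
  puts "#{tweet['created_at']}: #{tweet['text']}"
end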

gem "mongo"
gem "bson_ext"
gem "mongoid"

GEM
  specs:
    activemodel (3.2.2)
      activesupport (= 3.2.2)
      builder (~> 3.0.0)
    activesupport (3.2.2)
      i18n (~> 0.6)
      multi_json (~> 1.0)
    bson (1.5.2)
    bson_ext (1.5.2)
      bson (= 1.5.2)
    builder (3.0.0)
    i18n (0.6.0)
    mongo (1.5.2)
      bson (= 1.5.2)
    mongoid (2.4.2)
      activemodel (~> 3.1)
      mongo (~> 1.3)
      tzinfo (~> 0.3.22)
    multi_json (1.3.6)
    tzinfo (0.3.33)

PLATFORMS
  ruby

DEPENDENCIES
  bson_ext
  mongo
  mongoid
#!/usr/bin/env ruby
#
# If you request your Tweet history from Twitter using a Subject Access Request,
# you'll receive a file called yourusername-tweets.txt.
#
# At first glance, it looks like delimited YAML records; however, the data is not
# quite valid enough to be parsed by Ruby's standard YAML parser.
#
# The main issue is that each field in a single record is not quoted, and it may
# also be spread over multiple lines (generally the 'text' field) and contain
# special characters.
#
# Here is an example of the format:
#
# ********************
# user_id: 72573
# created_at: Thu Jun 21 09:43:33 +0000 2007
# created_via: web
# status_id: 114323742
# text: testing testing...
#
# The file also contains a PGP signature.
#
# This gist contains a class, StreamingTweetParser, which parses the file
# line by line and produces an array of Hashes for each record. Some specs are included
# at the bottom of the file.
#
# If you find a bug, add a new spec, fix it and send me a pull request or patch.
#
# To use this script directly, just run `import.rb <input-file>`. It will parse each
# tweet and attempt to import it into a MongoDB database.
$:.unshift(File.dirname(__FILE__))

require 'streaming_tweet_parser'
require 'mongodb_tweet_importer'

input_file = ARGV[0]

if input_file.nil?
  puts "Usage: #{File.basename($0)} <input-file>"
  exit(1)
end

parser = StreamingTweetParser.new

puts "Parsing records..."

File.readlines(input_file).each do |line|
  parser << line
end

puts "Parsed #{parser.tweets.length} tweets from #{File.basename(input_file)}."

importer = MongodbTweetImporter.new

puts "Importing into MongoDB..."

parser.tweets.each do |tweet|
  importer.import(tweet)
end

puts "Done."
require 'mongoid'
require 'time' # Time.parse is used below

Mongoid.database = Mongo::Connection.new.db("tweet_archive")

module Mongo
  class Tweet
    include Mongoid::Document

    store_in "tweets"

    field :user_id, type: Integer
    field :created_via, type: String
    field :status_id, type: Integer
    field :text, type: String

    index :status_id, unique: true

    def created_at
      Time.parse(read_attribute("created_at"))
    end

    def self.last_tweet_id
      order_by("status_id").last.status_id
    end
  end
end
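As a rough sketch (not part of the original gist) of how you might query the model above once some tweets have been imported; the require path is an assumed filename for the model file, and the where clause is just an example filter:

require 'mongo_tweet' # assumed filename for the Mongo::Tweet model above

puts Mongo::Tweet.count           # total tweets in the archive collection
puts Mongo::Tweet.last_tweet_id   # highest status_id stored so far

Mongo::Tweet.where(created_via: "web").each do |tweet|
  puts "#{tweet.created_at} #{tweet.text}"
end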
require 'mongo'

class MongodbTweetImporter
  def initialize(mongo = Mongo::Connection.new)
    @db = mongo.db("tweet_archive")
  end

  def import(record)
    @db["tweets"].insert(record)
  end
end
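The importer defaults to a MongoDB server on localhost; as a small sketch (not in the original gist), you can hand it your own connection instead, using the mongo 1.x driver's Mongo::Connection.new(host, port). The hostname below is hypothetical:

importer = MongodbTweetImporter.new(Mongo::Connection.new("db.example.com", 27017))
importer.import("user_id" => "72573", "text" => "an example tweet")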
class StreamingTweetParser
  attr_reader :tweets

  def initialize
    @tweets = []
  end

  def <<(line)
    if line =~ /^\*+/
      start_new_record
    elsif line =~ /-----BEGIN PGP SIGNATURE-----/
      finish
    else
      @current_record << line.strip if @current_record
    end
  end

  def finish
    if @current_record
      @tweets << @current_record.to_hash
    end
  end

  private

  def start_new_record
    if @current_record
      @tweets << @current_record.to_hash
    end
    @current_record = Record.new
  end

  class Record
    VALID_KEYS = [
      'user_id',
      'created_at',
      'status_id',
      'created_via',
      'text'
    ]

    def initialize
      @fields = []
    end

    def <<(line)
      return if line.strip == ""

      if line =~ /(#{VALID_KEYS.join('|')}):\s.*/
        parts = line.split(/:\s/)
        @fields << [parts[0], parts[1..-1].join(": ")]
      else
        @fields.last[1] << "\n" << line # multi-line continuation of last field
      end
    end

    def to_hash
      Hash[*@fields.flatten]
    rescue
      puts "Error converting #{@fields.inspect}"
    end
  end
end
if __FILE__ == $0
  require 'rspec/autorun'

  describe StreamingTweetParser do
    before do
      @parser = StreamingTweetParser.new
    end

    it 'can parse a single record with no special characters' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: tweet 1"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0].to_hash.should have(5).keys
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == 'tweet 1'
    end

    it 'ignores data before the first record' do
      @parser << "some data"
      @parser << "before"
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: tweet 1"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == 'tweet 1'
    end

    it 'can parse multiple records with no special characters' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: tweet 1"
      @parser << "\n"
      @parser << "********************"
      @parser << "user_id: 72574"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: tweet 2"
      @parser << "\n"
      @parser.finish

      @parser.should have(2).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == 'tweet 1'
      @parser.tweets[1]['user_id'].should == '72574'
      @parser.tweets[1]['text'].should == 'tweet 2'
    end
    it 'can parse a single record with an @ symbol' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: @reply tweet 1"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == '@reply tweet 1'
    end

    it 'can parse a single record with quotes' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: @reply \"tweet\" 1"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == '@reply "tweet" 1'
    end
    it 'can parse a single record with text split over multiple lines' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: @reply tweet 1"
      @parser << "\n"
      @parser << "containing some new lines"
      @parser << "\n"
      @parser << "another line"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == "@reply tweet 1\ncontaining some new lines\nanother line"
    end

    it 'can parse a single record with text split over multiple lines with colons' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: @reply tweet 1"
      @parser << "oh yes: look"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == "@reply tweet 1\noh yes: look"
    end

    it 'can parse a single record with colons in the value' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: this is cool: http://google.com"
      @parser << "\n"
      @parser.finish

      @parser.should have(1).tweets
      @parser.tweets[0]['user_id'].should == '72573'
      @parser.tweets[0]['text'].should == 'this is cool: http://google.com'
    end

    it 'automatically finishes if it sees a signature' do
      @parser << "********************"
      @parser << "user_id: 72573"
      @parser << "created_at: Fri Nov 02 15:31:13 +0000 2007"
      @parser << "created_via: twitterrific"
      @parser << "status_id: 383462562"
      @parser << "text: tweet 1"
      @parser << "\n"
      @parser << "-----BEGIN PGP SIGNATURE-----"

      @parser.should have(1).tweets
    end
  end
end