mxbees/extract-data-from-twitter-archives.rb

## extract-data-from-twitter-archives.rb
#!/bin/ruby

require 'open-uri'
require 'uri'
require 'json'
require 'rubygems'

# Removes the "Grailbird.data.tweets_<name> =" assignment that precedes the
# JSON array in an archive .js file, leaving text that JSON.parse can read.
#
# @param raw [String] full contents of one archive .js file
# @return [String] the contents with the first such prefix removed
def strip_archive_header(raw)
  # Regex adapted from: http://rubyist.g.hatena.ne.jp/hatecha/20130322/tweetsjson
  # The dots are escaped so they only match literal dots.
  raw.sub(/^Grailbird\.data\.tweets_[^=]*=/, '')
end

# Appends one field of every tweet in the matching archive files to a text
# file, one `puts` per tweet so each value lands on its own line.
#
# The default paths assume the script is run from the directory where the
# twitter archive was unzipped.
#
# @param archive_glob [String] glob pattern selecting the archive .js files
# @param output_path [String] file the values are appended to (never truncated)
# @param field [String] tweet key to extract; change it to pull other data
# @return [void]
# @raise [JSON::ParserError] if an archive file is not valid JSON after the
#   header is stripped
# @raise [KeyError] if a tweet has no +field+ key
def extract_tweet_texts(archive_glob = 'data/js/tweets/*.js',
                        output_path = 'tweet_text.txt',
                        field = 'text')
  # Sorted explicitly: Dir.glob only guarantees an order from Ruby 3.0 on.
  Dir.glob(archive_glob).sort.each do |archive_path|
    tweets = JSON.parse(strip_archive_header(File.read(archive_path)))

    # Block form closes (and flushes) the handle when the block exits, instead
    # of leaving one open handle behind for every tweet.
    File.open(output_path, 'a') do |out|
      tweets.each { |tweet| out.puts(tweet.fetch(field)) }
    end
  end
end

extract_tweet_texts
	#!/bin/ruby

	require 'open-uri'
	require 'uri'
	require 'json'
	require 'rubygems'

	# Removes the "Grailbird.data.tweets_<name> =" assignment that precedes the
	# JSON array in an archive .js file, leaving text that JSON.parse can read.
	#
	# @param raw [String] full contents of one archive .js file
	# @return [String] the contents with the first such prefix removed
	def strip_archive_header(raw)
	  # Regex adapted from: http://rubyist.g.hatena.ne.jp/hatecha/20130322/tweetsjson
	  # The dots are escaped so they only match literal dots.
	  raw.sub(/^Grailbird\.data\.tweets_[^=]*=/, '')
	end

	# Appends one field of every tweet in the matching archive files to a text
	# file, one `puts` per tweet so each value lands on its own line.
	#
	# The default paths assume the script is run from the directory where the
	# twitter archive was unzipped.
	#
	# @param archive_glob [String] glob pattern selecting the archive .js files
	# @param output_path [String] file the values are appended to (never truncated)
	# @param field [String] tweet key to extract; change it to pull other data
	# @return [void]
	# @raise [JSON::ParserError] if an archive file is not valid JSON after the
	#   header is stripped
	# @raise [KeyError] if a tweet has no +field+ key
	def extract_tweet_texts(archive_glob = 'data/js/tweets/*.js',
	                        output_path = 'tweet_text.txt',
	                        field = 'text')
	  # Sorted explicitly: Dir.glob only guarantees an order from Ruby 3.0 on.
	  Dir.glob(archive_glob).sort.each do |archive_path|
	    tweets = JSON.parse(strip_archive_header(File.read(archive_path)))

	    # Block form closes (and flushes) the handle when the block exits, instead
	    # of leaving one open handle behind for every tweet.
	    File.open(output_path, 'a') do |out|
	      tweets.each { |tweet| out.puts(tweet.fetch(field)) }
	    end
	  end
	end

	extract_tweet_texts