Created
March 5, 2015 19:14
-
-
Save kenbod/def2ca3d5851067a205d to your computer and use it in GitHub Desktop.
A script to import tweets into MongoDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bundler/setup' | |
require 'date' | |
require 'json' | |
require 'mongo' | |
require 'time' | |
include Mongo | |
# Prints +message+ followed by one backspace character per character of the
# message, so the cursor returns to where the message started and the next
# call overwrites it in place (a single-line progress display).
#
# @param message [String] text to display
# @return [IO] the flushed output stream
def print_message(message)
  print message
  # String repetition replaces building an array of "\b" and joining it.
  print "\b" * message.size
  # Flush the stream Kernel#print actually writes to.
  $stdout.flush
end
# Parses a textual timestamp into a DateTime.
#
# @param value [String] date/time text understood by DateTime.parse
# @return [DateTime] the parsed timestamp
# @raise [ArgumentError, TypeError] propagated from DateTime.parse when the
#   value is not a parseable string
def to_datetime(value)
  DateTime.parse(value)
end
# Converts a date/time object into a Time expressed in UTC.
#
# @param value [#to_time] object responding to +to_time+ (e.g. a DateTime)
# @return [Time] the same instant, in UTC
def to_time(value)
  as_time = value.to_time
  as_time.utc
end
# Walks +keys+ from the top of +tweet+ and returns whatever is found at the
# end of the path. An empty path returns +tweet+ itself.
#
# @param tweet [Hash] nested structure to read from
# @param keys [Array] successive keys to index with
# @return [Object] the value at the end of the path
# @raise [NoMethodError] when an intermediate value is nil
def get_value(tweet, keys)
  keys.inject(tweet) { |node, key| node[key] }
end
# Stores +new_value+ at the location in +tweet+ addressed by +keys+.
#
# The whole path is traversed (including the final key) while remembering the
# container visited last; the assignment is then made on that container.
#
# @param tweet [Hash] nested structure to modify in place
# @param keys [Array] path of keys; the last one names the slot to assign
# @param new_value [Object] value to store
# @return [Object] +new_value+
# @raise [NoMethodError] when a container on the path is nil or +keys+ is empty
def set_value(tweet, keys, new_value)
  parent = nil
  keys.inject(tweet) do |node, key|
    parent = node
    node[key]
  end
  parent[keys.last] = new_value
end
# Lists the key paths, relative to a tweet, of every 'created_at' attribute
# that convert_created_at rewrites.
#
# @return [Array<Array<String>>] a freshly built list of key paths
def key_array
  [
    %w[created_at],
    %w[user created_at],
    %w[retweeted_status created_at],
    %w[retweeted_status user created_at]
  ]
end
# Rewrites, in place, each attribute listed by key_array from its textual
# form to a UTC Time. Paths whose lookup, parsing or assignment raises a
# StandardError are skipped and left untouched.
#
# @param tweet [Hash] parsed tweet, modified in place
# @return [Array] the list of key paths that was iterated
def convert_created_at(tweet)
  key_array.each do |path|
    begin
      raw = get_value(tweet, path)
      set_value(tweet, path, to_time(to_datetime(raw)))
    rescue StandardError
      # Best effort: the attribute is absent (or could not be converted),
      # so leave this path as it is and move on to the next one.
      next
    end
  end
end
# Deletes the 'coordinates' entry from +tweet+ when it is nil or when the
# first element of its nested 'coordinates' array equals 0.
# NOTE(review): a leading 0 presumably marks a placeholder position — confirm.
#
# @param tweet [Hash] parsed tweet, modified in place
# @return [Object, nil] the removed entry when one was deleted, otherwise nil
def remove_coordinates_if_needed(tweet)
  geo = tweet['coordinates']
  tweet.delete('coordinates') if geo.nil? || geo['coordinates'][0] == 0
end
# Script entry point: reads the file named by the first command-line argument
# one line at a time, parses each line as a JSON tweet, converts its
# 'created_at' attributes, drops unusable coordinates and inserts the result
# into the 'tweets' collection of the 'data' database.
if __FILE__ == $0
  STDOUT.sync = true

  input_file = ARGV[0]
  # Fail early with a usage hint instead of a TypeError from the file read.
  abort "Usage: #{$0} <file with one JSON tweet per line>" if input_file.nil?

  mongo = MongoClient.new
  begin
    tweets = mongo.db('data')['tweets']
    index = 0
    # File.foreach always treats its argument as a path, whereas IO.foreach
    # runs a subprocess when the argument starts with "|".
    File.foreach(input_file) do |line|
      index += 1
      print_message("Processing Line #{index}")
      json = line.chomp
      # A blank line is not valid JSON; skip it but keep the line count.
      next if json.strip.empty?

      tweet = JSON.parse(json)
      convert_created_at(tweet)
      remove_coordinates_if_needed(tweet)
      tweets.insert(tweet)
    end
  ensure
    # Release the connection even when reading, parsing or inserting raises.
    mongo.close
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment