Skip to content

Instantly share code, notes, and snippets.

@kenbod
Created March 5, 2015 19:14
Show Gist options
  • Save kenbod/def2ca3d5851067a205d to your computer and use it in GitHub Desktop.
Save kenbod/def2ca3d5851067a205d to your computer and use it in GitHub Desktop.
A script to import tweets into MongoDB
require 'bundler/setup'
require 'date'
require 'json'
require 'mongo'
require 'time'
include Mongo
# Prints +message+ without a newline and then emits one backspace per
# character, so the next call overwrites it on the same terminal line
# (a simple in-place progress indicator).
#
# message - the String to display.
def print_message(message)
  print message
  # One "\b" for every character just written moves the cursor back to
  # where the message started.
  print "\b" * message.size
  STDOUT.flush
end
# Parses a textual timestamp into a DateTime.
#
# value - the String to parse.
#
# Returns whatever DateTime.parse produces for +value+; any error raised by
# DateTime.parse propagates to the caller.
def to_datetime(value)
  parsed = DateTime.parse(value)
  parsed
end
# Converts +value+ to a Time expressed in UTC.
#
# value - any object responding to #to_time (the script passes a DateTime).
#
# Returns the Time produced by value.to_time, switched to UTC.
def to_time(value)
  as_time = value.to_time
  as_time.utc
end
# Follows +keys+ one level at a time into the nested structure +tweet+.
#
# tweet - the outermost container (indexed with #[]).
# keys  - an Array of keys, outermost first.
#
# Returns the object found at the end of the path; with an empty +keys+ that
# is +tweet+ itself. No nil check is made along the way, so indexing into a
# missing intermediate level raises.
def get_value(tweet, keys)
  keys.inject(tweet) { |node, key| node[key] }
end
# Stores +new_value+ at the location named by +keys+ inside the nested
# structure +tweet+.
#
# tweet     - the outermost container (indexed with #[]).
# keys      - an Array of keys, outermost first; the last key is the slot
#             that gets assigned.
# new_value - the object to store.
#
# The whole path is walked, remembering the container visited just before
# the final lookup; that container receives the assignment.
def set_value(tweet, keys, new_value)
  parent = nil
  keys.inject(tweet) do |node, key|
    parent = node
    node[key]
  end
  parent[keys[-1]] = new_value
end
# Key paths of every 'created_at' attribute that convert_created_at tries to
# rewrite: on the tweet, on its user, on the retweeted status, and on the
# retweeted status's user.
#
# Returns a freshly built Array of key paths (each an Array of Strings).
def key_array
  [
    %w[created_at],
    %w[user created_at],
    %w[retweeted_status created_at],
    %w[retweeted_status user created_at]
  ]
end
# Rewrites, in place, each 'created_at' attribute listed by key_array from
# its textual form to a UTC Time.
#
# tweet - the parsed tweet to modify.
#
# Every path is handled independently: any StandardError raised while
# reading, converting or writing one path is swallowed so the remaining
# paths are still processed.
def convert_created_at(tweet)
  key_array.each do |path|
    begin
      raw = get_value(tweet, path)
      set_value(tweet, path, to_time(to_datetime(raw)))
    rescue StandardError
      # do nothing if attribute does not exist
    end
  end
end
# Deletes the 'coordinates' entry from +tweet+ when it carries no usable
# position: either the entry is nil/absent, or the first element of its
# nested 'coordinates' list equals 0.
#
# tweet - the parsed tweet to modify in place.
def remove_coordinates_if_needed(tweet)
  geo = tweet['coordinates']
  # Short-circuit keeps us from indexing into geo when it is nil.
  unusable = geo.nil? || geo['coordinates'][0] == 0
  tweet.delete('coordinates') if unusable
end
# Script entry point: reads the file named by the first command-line
# argument one line at a time, parses each line as a JSON tweet, normalises
# it, and inserts it into the 'tweets' collection of the 'data' database.
if __FILE__ == $0
  input_file = ARGV[0]
  # Fail with a readable message instead of a TypeError from the file read.
  abort "Usage: #{$0} <file with one JSON tweet per line>" if input_file.nil?

  STDOUT.sync = true
  mongo = MongoClient.new
  begin
    tweets = mongo.db('data')['tweets']
    index = 0
    # File.foreach (rather than IO.foreach) never interprets a path that
    # starts with "|" as a command to run.
    File.foreach(input_file) do |line|
      index += 1
      print_message("Processing Line #{index}")
      payload = line.chomp
      # A blank line is not a JSON document; skip it rather than let
      # JSON.parse abort the whole import.
      next if payload.strip.empty?
      tweet = JSON.parse(payload)
      convert_created_at(tweet)
      remove_coordinates_if_needed(tweet)
      tweets.insert(tweet)
    end
  ensure
    # Release the connection even when a line fails to parse or insert.
    mongo.close
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment