Last active
August 29, 2015 14:01
-
-
Save mattscilipoti/bebb86c8600cad0168a9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Used to convert json from MixPanel into csv | |
# see: https://stackoverflow.com/questions/7845015/convert-json-to-csv-in-ruby | |
# and: https://stackoverflow.com/questions/273262/best-practices-with-stdin-in-ruby | |
class MixPanelJsonToCsv | |
require 'csv' | |
require 'optparse' | |
attr_reader :config | |
def initialize(cli_args) | |
parse_cli_args(cli_args) | |
end | |
def convert_file(file_io) | |
csv_string = CSV.generate(:force_quotes => true) do |csv| | |
file_io.each_with_index do |raw_json_line, row_index| | |
parser = RowParser.new(raw_json_line, row_index) | |
if row_index == 0 && !config.skip_header | |
csv << RowParser.columns | |
end | |
csv << parser.values | |
end | |
end | |
puts csv_string | |
end | |
private | |
def parse_cli_args(args) | |
@config = OpenStruct.new | |
optparse = OptionParser.new do|opts| | |
opts.banner = "Usage: mixpanel_json_to_csv [options] < input.json > output.csv" | |
opts.on('-h', '--help', 'Displays this help.') do | |
puts opts | |
exit | |
end | |
opts.on('--columns', "Lists known columns.") do | |
puts RowParser.known_columns.inspect | |
exit | |
end | |
opts.on('-s', '--skip-headers', "Do not output header row.") { config.skip_header = true } | |
end | |
# Parse arguments and remove them from ARGV, so ARGF uses stdin | |
optparse.parse! args | |
end | |
end | |
# Converts a single json row (from MixPanel) to csv | |
class RowParser | |
require 'json' | |
attr_reader :raw_json_line, :row_index | |
def self.columns | |
@columns | |
end | |
def self.columns=(value) | |
@columns = value | |
end | |
# Since JSON source data can be sparce, we don't have a reliable "reference" row for columns to convert. | |
def self.known_columns | |
[ "$browser", "$city", "$created", "$device", "$email", "$initial_referrer", "$initial_referring_domain", | |
"$name", "$os", "$referrer", "$referring_domain", "$region", "$search_engine", | |
"Account type", "Auth type", "Classes count", "Customer", "Lesson grade", "Lesson id", "Lesson subject", | |
"Optimizely Lesson Page Tour", "Resource id", "Resource label", "Resource type", | |
"User grade levels", "User roles", "User subjects", | |
"action", "category", "customer", "distinct_id", "label", "last_visited", "mp_country_code", "mp_keyword", "mp_lib", | |
"time", "user_id", "$username", "utm_campaign", "utm_content", "utm_medium", "utm_source", "utm_term" | |
].sort.insert(0, 'event').uniq # event is always the first column | |
end | |
def initialize(raw_json_line, row_index) | |
@raw_json_line = raw_json_line | |
@row_index = row_index | |
unless RowParser.columns # use first row as "reference" to add additional columns | |
RowParser.columns = (RowParser.known_columns | properties.keys.sort) | |
end | |
validate! | |
end | |
def columns | |
RowParser.columns | |
end | |
def event_name | |
parsed_json['event'] | |
end | |
def extra_columns | |
properties.keys - RowParser.columns | |
end | |
def parsed_json | |
@parsed_json ||= JSON.parse(raw_json_line) | |
end | |
def properties | |
parsed_json['properties'] | |
end | |
def validate! | |
unless extra_columns.empty? | |
raise "Row ##{row_index} contains unknown columns: #{extra_columns.inspect}. Please add them to #known_columns." | |
end | |
end | |
def values | |
# json row is sparse, make sure all columns are represented | |
RowParser.columns.inject([]) do |row_values, header| | |
case header | |
when 'event' | |
row_values << event_name | |
else | |
row_values << properties.fetch(header, '') | |
end | |
row_values | |
end | |
end | |
end | |
MixPanelJsonToCsv.new(ARGV).convert_file(ARGF) # ARGF handles file from stdin |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment