Skip to content

Instantly share code, notes, and snippets.

@mattscilipoti
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattscilipoti/bebb86c8600cad0168a9 to your computer and use it in GitHub Desktop.
Save mattscilipoti/bebb86c8600cad0168a9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# Used to convert json from MixPanel into csv
# see: https://stackoverflow.com/questions/7845015/convert-json-to-csv-in-ruby
# and: https://stackoverflow.com/questions/273262/best-practices-with-stdin-in-ruby
class MixPanelJsonToCsv
require 'csv'
require 'optparse'
attr_reader :config
def initialize(cli_args)
parse_cli_args(cli_args)
end
def convert_file(file_io)
csv_string = CSV.generate(:force_quotes => true) do |csv|
file_io.each_with_index do |raw_json_line, row_index|
parser = RowParser.new(raw_json_line, row_index)
if row_index == 0 && !config.skip_header
csv << RowParser.columns
end
csv << parser.values
end
end
puts csv_string
end
private
def parse_cli_args(args)
@config = OpenStruct.new
optparse = OptionParser.new do|opts|
opts.banner = "Usage: mixpanel_json_to_csv [options] < input.json > output.csv"
opts.on('-h', '--help', 'Displays this help.') do
puts opts
exit
end
opts.on('--columns', "Lists known columns.") do
puts RowParser.known_columns.inspect
exit
end
opts.on('-s', '--skip-headers', "Do not output header row.") { config.skip_header = true }
end
# Parse arguments and remove them from ARGV, so ARGF uses stdin
optparse.parse! args
end
end
# Converts a single json row (from MixPanel) to csv
class RowParser
require 'json'
attr_reader :raw_json_line, :row_index
def self.columns
@columns
end
def self.columns=(value)
@columns = value
end
# Since JSON source data can be sparce, we don't have a reliable "reference" row for columns to convert.
def self.known_columns
[ "$browser", "$city", "$created", "$device", "$email", "$initial_referrer", "$initial_referring_domain",
"$name", "$os", "$referrer", "$referring_domain", "$region", "$search_engine",
"Account type", "Auth type", "Classes count", "Customer", "Lesson grade", "Lesson id", "Lesson subject",
"Optimizely Lesson Page Tour", "Resource id", "Resource label", "Resource type",
"User grade levels", "User roles", "User subjects",
"action", "category", "customer", "distinct_id", "label", "last_visited", "mp_country_code", "mp_keyword", "mp_lib",
"time", "user_id", "$username", "utm_campaign", "utm_content", "utm_medium", "utm_source", "utm_term"
].sort.insert(0, 'event').uniq # event is always the first column
end
def initialize(raw_json_line, row_index)
@raw_json_line = raw_json_line
@row_index = row_index
unless RowParser.columns # use first row as "reference" to add additional columns
RowParser.columns = (RowParser.known_columns | properties.keys.sort)
end
validate!
end
def columns
RowParser.columns
end
def event_name
parsed_json['event']
end
def extra_columns
properties.keys - RowParser.columns
end
def parsed_json
@parsed_json ||= JSON.parse(raw_json_line)
end
def properties
parsed_json['properties']
end
def validate!
unless extra_columns.empty?
raise "Row ##{row_index} contains unknown columns: #{extra_columns.inspect}. Please add them to #known_columns."
end
end
def values
# json row is sparse, make sure all columns are represented
RowParser.columns.inject([]) do |row_values, header|
case header
when 'event'
row_values << event_name
else
row_values << properties.fetch(header, '')
end
row_values
end
end
end
MixPanelJsonToCsv.new(ARGV).convert_file(ARGF) # ARGF handles file from stdin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment