Skip to content

Instantly share code, notes, and snippets.

@cwsteinbach
Created April 19, 2013 23:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwsteinbach/783d12ccf777571d29fe to your computer and use it in GitHub Desktop.
Save cwsteinbach/783d12ccf777571d29fe to your computer and use it in GitHub Desktop.
Updated version of GithubArchive's transform.rb script. The original can be found here: https://github.com/igrigorik/githubarchive.org/blob/master/bigquery/transform.rb
# encoding: UTF-8
require 'optparse'
require 'time'
require 'zlib'
# Yajl::Parser is used below to parse the schema file and every input line.
require 'yajl'
require 'csv'
# Put the current working directory on the load path so that the local
# remap.rb can be required by name.
$: << '.'
# NOTE(review): remap.rb is not shown here; presumably it supplies
# String#remap, String#clean! and IGNORED, which `save` relies on -- confirm.
require 'remap.rb'
# With no command-line arguments, act as if --help had been given.
ARGV << '--help' if ARGV.empty?
# Command-line settings. :input and :output have no default and stay unset
# until the matching flag is given.
options = { schema: 'schema.js', verbose: false, compress: false }

cli = OptionParser.new do |opts|
  opts.banner = "Usage: flatten.rb [options]"

  # Each entry: short flag, long flag, help text, key written into `options`.
  [
    ["-i", "--input FILE", "input filename", :input],
    ["-o", "--output FILE", "output filename", :output],
    ["-s", "--schema FILE", "schema file (default: schema.js)", :schema],
    ["-c", "--[no-]compress", "compress output", :compress],
    ["-v", "--verbose", "verbose log (default: false)", :verbose]
  ].each do |short, long, description, key|
    opts.on(short, long, description) { |value| options[key] = value }
  end
end

# Consumes the recognised flags from ARGV.
cli.parse!
#
# Flatten a nested GitHub event hash into a single-level hash whose keys are
# the nested keys joined with "_" (e.g. {'a' => {'b' => 1}} becomes 'a_b').
# `save` matches these flat keys against the Big Query column schema.
#
# h      - hash that receives the flattened entries (mutated and returned)
# e      - hash to flatten; keys are concatenated onto a String prefix
# prefix - key prefix accumulated while recursing (default: '')
#
# Array values are not copied. String values have every run of whitespace
# collapsed to one space and are cut to 10000 characters plus ' ...' when
# longer. Everything else is copied as-is.
#
# Returns h.
#
def flatmap(h, e, prefix = '')
  e.each do |key, value|
    case value
    when Hash
      # Descend, extending the prefix with this key.
      flatmap(h, value, prefix + key + "_")
    when Array
      next
    when String
      text = value.split.join(' ')
      text = text[0, 10000] + ' ...' if text.size > 10000
      h[prefix + key] = text
    else
      h[prefix + key] = value
    end
  end
  h
end
# Copy the flattened fields of one event into a row.
#
# row   - container answering include?(key) and []=(key, value); only keys
#         it already includes are written
# event - nested event hash, flattened with flatmap
# opt   - options hash; opt[:verbose] turns on the log lines below
#
# For each flattened field:
#   * keys ending in "_at" are parsed as times and rewritten as UTC
#     'YYYY-MM-DD HH:MM:SS'; a value that cannot be parsed becomes ''
#   * String values are passed through clean!
#   * the value is stored under its own key if the row has it, otherwise
#     under key.remap if the row has that; otherwise it is dropped
#     (reported in verbose mode unless the key is listed in IGNORED)
#
# NOTE(review): clean!, remap and IGNORED are not defined in this file;
# presumably they come from remap.rb -- confirm.
def save(row, event, opt)
  flatmap({}, event).each do |field, value|
    if field =~ /_at$/
      value = begin
        Time.parse(value).utc.strftime('%Y-%m-%d %T')
      rescue
        ''
      end
    end
    value.clean! if value.is_a?(String)

    # Direct hit: the column exists under the flattened name.
    if row.include?(field)
      row[field] = value
      next
    end

    # Otherwise try the remapped column name.
    mapped = field.remap
    if row.include?(mapped)
      puts "Remapped #{field} => #{mapped}, value: #{value}" if opt[:verbose]
      row[mapped] = value
    elsif opt[:verbose] && !IGNORED.include?(field)
      puts "Unknown field: #{field}, value: #{value}"
    end
  end
end
#
# Main driver: read the gzip-compressed input one line at a time, parse each
# line as a JSON event, map it onto a CSV row whose headers are the 'name'
# entries of the schema file, and append the row to the output file
# (gzip-compressed when --compress is given).
#
# Without this check a missing --input fails later with NoMethodError
# (nil + String).
abort 'Missing required option: --input FILE' unless options[:input]

start = Time.now
# File.read closes the handle and, unlike Kernel#open, never treats a path
# beginning with "|" as a command to run.
schema = Yajl::Parser.parse(File.read(options[:schema]))
headers = schema.map { |f| f['name'] }

# Declared before `begin` so the ensure clause can tell whether the output
# was ever opened.
out = nil
begin
  options[:output] ||= options[:input] + "-out.csv"
  options[:output] += '.gz' if options[:compress]
  out = File.new(options[:output], "w")
  out = Zlib::GzipWriter.new(out) if options[:compress]
  cnt = 0

  # The block form closes the input file when the block returns or raises.
  Zlib::GzipReader.open(options[:input]) do |gz|
    gz.each_line do |line|
      begin
        Yajl::Parser.parse(line) do |event|
          r = CSV::Row.new(headers, [])

          case event['type']
          when 'PushEvent'
            # 'size' and 'shas' are removed from the payload so they are not
            # flattened into the row.
            event['payload'].delete 'size'
            commits = event['payload'].delete('shas') || []
            # NOTE(review): every commit is saved into the same row, so later
            # commits overwrite earlier ones and an event with no commits
            # yields an empty row -- confirm this is intended.
            commits.each do |commit|
              id, email, msg, name, flag = *commit
              event['payload'].merge!({
                'commit' => {
                  'id' => id, 'email' => email, 'msg' => msg,
                  'name' => name, 'flag' => flag
                }
              })
              save(r, event, options)
            end
          when 'GollumEvent'
            pages = event['payload'].delete('pages') || []
            pages.each do |page|
              event['payload'].merge!({'page' => page})
              save(r, event, options)
            end
          else
            save(r, event, options)
          end

          # abort raises SystemExit, which the per-line `rescue` below
          # (StandardError only) does not catch, so the script really exits
          # as the message says.
          abort "Record <> schema mismatch: #{r.size}, #{schema.size}. Exiting." if r.size != schema.size

          cnt += 1
          out.write r.to_s
        end
      rescue => e
        # Best effort: a record that cannot be parsed or mapped is skipped,
        # but it is reported in verbose mode instead of vanishing silently.
        warn "Skipped record: #{e.class}: #{e.message}" if options[:verbose]
      end
    end
  end

  puts "Processed #{options[:input]}: #{cnt} rows in #{(Time.now - start).round} seconds"
ensure
  # `out` is still nil when opening the output failed; calling close on it
  # would raise NoMethodError and hide the original error.
  out.close if out
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment