Last active
August 29, 2015 14:28
-
-
Save yyamano/505b43147ecabc132507 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Taken from embulk/lib/embulk/guess/csv.rb
# Returns the arithmetic mean of +array+ as a Float.
#
# The fold is seeded with 0.0 so the final division is floating-point even
# when every element is an Integer.
#
# @param array [Array<Numeric>] values to average
# @return [Float] the mean; NaN for an empty array (0.0 / 0)
def array_avg(array)
  array.inject(0.0) { |total, value| total + value } / array.size
end
# Returns the population variance of +array+: the mean of the squared
# deviations from the value returned by array_avg, dividing by the number
# of elements (not by n - 1).
#
# @param array [Array<Numeric>] values to measure
# @return [Float] the variance
def array_variance(array)
  avg = array_avg(array)
  array.inject(0.0) { |total, value| total + (value - avg)**2 } / array.size
end
# Returns the length of the cell at +column_index+ in +row+.
#
# A cell that is missing (index beyond the end of the row) or nil counts as
# length 0, so ragged rows and rows with nil cells can be measured together.
#
# @param row [Array<String, nil>] one row of cells
# @param column_index [Integer] position of the cell within the row
# @return [Integer] the cell's length, or 0 when there is no cell
def column_length(row, column_index)
  cell = row[column_index]
  cell ? cell.length : 0
end
# Prints a per-column comparison of cell-length variances for +rows+.
#
# For every column index up to the widest row, two variances of the cell
# lengths are computed: var1 over all rows, and var2 over all rows except the
# first. One line "c<index>: var1=... var2=..." is printed per column, then
# the array of `var1 < var2` results, then each row via #inspect.
#
# NOTE(review): the flags are presumably meant as a "first row is a header"
# indicator; the comparison direction is kept exactly as in the original.
#
# @param rows [Array<Array<String>>] rows of cells; rows may differ in length
# @return [Array<Array<String>>] +rows+ itself
def guess(rows)
  # `|| 0` keeps an empty +rows+ from raising on nil.
  column_count = rows.map(&:length).max || 0
  # Loop-invariant: the rows without the first one. drop(1) yields [] for an
  # empty input, where rows[1..-1] would be nil.
  body_rows = rows.drop(1)

  headers = (0...column_count).map do |column_index|
    var1 = array_variance(rows.map { |row| column_length(row, column_index) })
    var2 = array_variance(body_rows.map { |row| column_length(row, column_index) })
    puts "c#{column_index}: var1=#{var1} var2=#{var2}"
    var1 < var2
  end

  puts headers.inspect
  rows.each { |row| puts row.inspect }
end
# Two-column sample whose first row holds the column names.
dame = [
  ['magic', 'description'],
  ['0XDEADBEEF', 'word-fill pattern'],
  ['0xCAFEBABE', 'java byte code'],
  ['0xDEAD10CC', 'dead lock'],
  ['0xDEADFEED', 'crash report']
]

# Five-column sample whose first row holds the column names.
# NOTE(review): ' time' carries a leading space, possibly a typo for 'time';
# left unchanged because it affects the computed variances.
sample_01 = [
  ['id', 'account', ' time', 'purchase', 'comment'],
  ['1', '32864', '2015-01-27 19:23:49', '20150127', 'embulk'],
  ['2', '14824', '2015-01-27 19:01:23', '20150127', 'embulk jruby'],
  # Adjacent string literals are concatenated by Ruby, so the last cell below
  # evaluates to "Embulk csv parser plugin" with no embedded quote characters.
  ['3', '27559', '2015-01-28 02:20:02', '20150128', "Embulk ""csv"" parser plugin"],
  ['4', '11270', '2015-01-29 11:54:36', '20150129', 'NULL']
]

# Run the variance comparison on both samples.
guess(dame)
guess(sample_01)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
c0: var1=4.0 var2=0.0
c1: var1=7.44 var2=8.5
[false, true]
["magic", "description"]
["0XDEADBEEF", "word-fill pattern"]
["0xCAFEBABE", "java byte code"]
["0xDEAD10CC", "dead lock"]
["0xDEADFEED", "crash report"]
c0: var1=0.16000000000000006 var2=0.0
c1: var1=0.6399999999999999 var2=0.0
c2: var1=31.360000000000003 var2=0.0
c3: var1=0.0 var2=0.0
c4: var1=51.839999999999996 var2=60.75
[false, false, false, false, true]
["id", "account", " time", "purchase", "comment"]
["1", "32864", "2015-01-27 19:23:49", "20150127", "embulk"]
["2", "14824", "2015-01-27 19:01:23", "20150127", "embulk jruby"]
["3", "27559", "2015-01-28 02:20:02", "20150128", "Embulk csv parser plugin"]
["4", "11270", "2015-01-29 11:54:36", "20150129", "NULL"]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment