rickhull/histo.rb

## histo.rb
#!/usr/bin/env ruby

# set to true to run the interval_key self-tests instead of reading stdin
test_mode = false

# Histogram buckets (key => count), created in display order:
#   -1                            values below 0
#   10..90, 100..900, 1000..9000  nine steps per decade
#   10000                         last regular bucket
#   10001                         values above 10000
BINS = {}
decade_bounds = [10, 100, 1000].flat_map { |mult| (1..9).map { |step| mult * step } }
([-1] + decade_bounds + [10000, 10001]).each { |bound| BINS[bound] = 0 }

# Map a numeric value to the key of the bin it belongs to.
#   val > 10000  => 10001 (too high)
#   val < 0      => -1    (too low)
#   val <= 10    => 10    (also keeps 0 away from the log10 in scientize)
#   otherwise    => val rounded up to one leading digit, e.g. 25 => 30
def interval_key(val)
  if val > 10000
    10001
  elsif val < 0
    -1
  elsif val <= 10
    10
  else
    mantissa, magnitude = scientize(val)
    mantissa.ceil * 10**magnitude
  end
end

# Split a number into the two numeric parts of scientific notation.
# Returns [mantissa, exponent], where exponent is floor(log10(val)) and
# mantissa is val (as a Float) divided by 10**exponent.
# interval_key only calls this for values above 10.
def scientize(val)
  magnitude = Math.log10(val).floor
  scale = 10**magnitude
  mantissa = val.to_f / scale
  [mantissa, magnitude]
end

# Print a pseudo-histogram, with exact counts displayed.
#
# bins - Hash mapping a bin label to its Integer count; rows are printed
#        in the hash's iteration order.
#
# Writes via puts: a "Data points: N" header, a separator line, then one
# row per bin holding the right-justified label, the rounded percentage of
# the total, the exact count, and a bar of '*' characters.
def print_histo(bins)
  counts = bins.values
  # explicit seed so an empty hash totals 0 instead of nil
  total = counts.inject(0, :+)
  puts "Data points: #{total}"
  puts "==========="
  max_count = counts.max || 0
  count_width = max_count.to_s.length
  # 80ch width minus 10ch for ID / percentage minus 2ch for spacing
  max_bar_width = 68.0 - count_width
  # shrink bars only when the tallest one would overflow; never stretch them
  normal_factor = [max_count / max_bar_width, 1.0].max

  bins.each { |bin, count|
    # with no data points 0 / 0.0 is NaN, and NaN.round raises
    # FloatDomainError, so report 0% instead
    pctg = total.zero? ? 0 : (count * 100 / total.to_f).round
    bar_width = (count / normal_factor).ceil
    puts ["#{bin.to_s.rjust(5, ' ')}:#{pctg.to_s.rjust(2, ' ')}%",
          count.to_s.ljust(count_width, ' '),
          '*' * bar_width].join(' ')
  }
end

if test_mode
  puts "running tests..."

  # input value => bin key that interval_key is expected to return
  tests = {
    -5 => -1,
    -1 => -1,
    -0.00001 => -1,
    0 => 10,
    5 => 10,
    10 => 10,
    10.0 => 10,
    10.01 => 20,
    25 => 30,
    99 => 100,
    100 => 100,
    101 => 200,
    199 => 200,
    200 => 200,
    201 => 300,
    999 => 1000,
    1000 => 1000,
    3645 => 4000,
    9999 => 10000,
    10000 => 10000,
    463573745 => 10001,
  }

  tests.each { |val, expected|
    actual = interval_key(val)
    if actual != expected
      raise "val: #{val}; expected: #{expected}; actual: #{actual}"
    end
  }
  puts "SUCCESS"
else
  # Tally one number per stdin line into BINS, then print the histogram.
  $stdin.each_line { |line|
    num = line.chomp
    # compare after chomp so a final "null" with no trailing newline
    # (or one ending in "\r\n") is skipped too
    next if num == "null"
    # next unless num.match /\A[\d.]+\z/
    # String#to_f yields 0.0 for non-numeric text, so such lines count in bin 10
    key = interval_key(num.to_f)
    # BINS[key] ||= 0
    BINS[key] += 1
  }
  print_histo BINS
end
	#!/usr/bin/env ruby

	test_mode = false

	BINS = {}
	BINS[-1] = 0
	[10, 100, 1000].each { |mult|
	9.times { |i|
	BINS[mult * (i+1)] = 0
	}
	}
	BINS[10000] = 0
	BINS[10001] = 0

	# determine the correct bin for any value
	def interval_key(val)
	return 10001 if val > 10000 # too high
	return -1 if val < 0 # too low
	return 10 if val <= 10 # protect against value 0

	flt, exp = scientize(val)
	flt.ceil * 10**exp
	end

	# return the two numeric components of scientific notation
	def scientize(val)
	exp = Math.log10(val).floor
	[val.to_f / 10 ** exp, exp]
	end

	# print a pseudo-histogram, with exact counts displayed
	def print_histo(bins)
	total = bins.values.inject(:+)
	puts "Data points: #{total}"
	puts "==========="
	max_count = bins.values.max
	count_width = max_count.to_s.length
	# 80ch width minus 10ch for ID / percentage minus 2ch for spacing
	hist_width = 68.0 - count_width
	normal_factor = [bins.values.max / hist_width, 1.0].max

	bins.each { |bin, count|
	pctg = (count * 100 / total.to_f).round
	hist_width = (count / normal_factor).ceil
	puts ["#{bin.to_s.rjust(5, ' ')}:#{pctg.to_s.rjust(2, ' ')}%",
	count.to_s.ljust(count_width, ' '),
	'*' * hist_width].join(' ')
	}
	end

	unless test_mode
	$stdin.each_line { |num|
	next if num == "null\n"
	num.chomp!
	# next unless num.match /\A[\d.]+\z/
	key = interval_key(num.to_f)
	# BINS[key] ||= 0
	BINS[key] += 1
	}
	print_histo BINS
	else
	puts "running tests..."

	tests = {
	-5 => -1,
	-1 => -1,
	-0.00001 => -1,
	0 => 10,
	5 => 10,
	10 => 10,
	10.0 => 10,
	10.01 => 20,
	25 => 30,
	99 => 100,
	100 => 100,
	101 => 200,
	199 => 200,
	200 => 200,
	201 => 300,
	999 => 1000,
	1000 => 1000,
	3645 => 4000,
	9999 => 10000,
	10000 => 10000,
	463573745 => 10001,
	}

	tests.each { |val, expected|
	actual = interval_key(val)
	if actual != expected
	raise "val: #{val}; expected: #{expected}; actual: #{actual}"
	end
	}
	puts "SUCCESS"
	end