Skip to content

Instantly share code, notes, and snippets.

@mru2
Created December 13, 2013 17:01
Show Gist options
  • Save mru2/7947486 to your computer and use it in GitHub Desktop.
Save mru2/7947486 to your computer and use it in GitHub Desktop.
Grouping and 2D projection for multi-dimensional analytics data
require 'ostruct'
require 'rspec'
require 'set'
#######
# SPECS
#######
describe "blob projection" do
  # The fixture is rebuilt for every example: Blob#filter merges into the
  # receiver's filters, so a blob shared through before(:all) would carry the
  # filter from the filtering example into every example that runs after it.
  before(:each) do
    @blob = Blob.new(:month, :calendar, :state)

    # Each row is [month, calendar, state, value].
    results = [
      ["2013-11", "agenda1", "venu", 5],
      ["2013-11", "agenda1", "pas venu", 2],
      ["2013-11", "agenda2", "venu", 6],
      ["2013-11", "agenda2", "pas venu", 0],
      ["2013-12", "agenda1", "venu", 7],
      ["2013-12", "agenda1", "pas venu", 1],
      ["2013-12", "agenda2", "venu", 9],
      ["2013-12", "agenda2", "pas venu", 3]
    ]

    results.each do |result|
      # Everything but the trailing value is the key.
      *keys, value = result
      @blob.push keys, value
    end
  end

  it "should project to 1 dimension" do
    projection = @blob.project(:month)

    expect(projection.columns).to eq(["2013-11", "2013-12"])
    expect(projection.values).to eq([[13, 20]])
  end

  it "should project to 2 dimensions" do
    projection = @blob.project(:calendar, :month)

    expect(projection.columns).to eq(["agenda1", "agenda2"])
    expect(projection.rows).to eq(["2013-11", "2013-12"])
    expect(projection.values).to eq([[7, 6], [8, 12]])
  end

  it "should handle cross-arrays" do
    projection = @blob.project(:state, [:month, :calendar])

    expect(projection.columns).to eq(["venu", "pas venu"])
    expect(projection.rows).to eq([["2013-11", "agenda1"], ["2013-11", "agenda2"], ["2013-12", "agenda1"], ["2013-12", "agenda2"]])
    expect(projection.values).to eq([[5, 2], [6, 0], [7, 1], [9, 3]])
  end

  it "should handle filtering" do
    filtered_projection = @blob.filter(:month => "2013-11").project(:state, :calendar)

    expect(filtered_projection.columns).to eq(["venu", "pas venu"])
    expect(filtered_projection.rows).to eq(["agenda1", "agenda2"])
    expect(filtered_projection.values).to eq([[5, 2], [6, 0]])
  end
end
################
# IMPLEMENTATION
################
# Hash-backed inverted index: each key maps to the Set of values recorded
# under it.
class Index < Hash
  # Records +value+ under +key+, creating the key's Set on first use.
  #
  # @param key [Object] bucket the value is filed under
  # @param value [Object] value to remember; the Set collapses duplicates
  # @return [Set] the bucket for +key+, now containing +value+
  def add(key, value)
    bucket = self[key] || store(key, Set.new)
    bucket << value
  end
end
# In-memory store for additive values keyed by several named dimensions
# (for example month / calendar / state), with grouping and 2D projection.
#
# Every pushed entry is indexed per dimension (dimension value => Set of
# positions in the data array). A table cell is resolved by intersecting the
# position sets of all dimension values the cell is constrained on.
class Blob
  # @param key_names [Array<Symbol>] dimension names, in the order in which
  #   the key values are later passed to #push
  def initialize(*key_names)
    @key_names = key_names
    @data = []
    @filters = {}
    @indexes = {}
    @key_names.each { |key_name| @indexes[key_name] = Index.new }
  end

  # Stores one entry and indexes it under each of its dimension values.
  #
  # @param keys [Array] one value per dimension, in the order given to ::new
  # @param value [Object] the measure; cells combine values with +
  def push(keys, value)
    @data.push [keys, value]
    position = @data.length - 1
    @key_names.each_with_index do |key_name, i|
      @indexes[key_name].add keys[i], position
    end
  end

  # Aggregates the stored values into a table.
  #
  # @param column_keys [Symbol, Array<Symbol>] dimension(s) laid out as columns
  # @param row_keys [Symbol, Array<Symbol>, nil] dimension(s) laid out as
  #   rows; when omitted the table has a single row
  # @return [OpenStruct] responding to
  #   - columns: the column labels; plain dimension values when a single name
  #     was given, arrays of values (in the requested order) when an Array
  #     of names was given
  #   - rows: the row labels, same convention; [] without row dimensions
  #   - values: one array per row holding one cell per column; a cell is the
  #     values of all matching entries combined with +, or nil when no entry
  #     matches
  # @raise [ArgumentError] if no column dimension is given or a dimension
  #   name is unknown
  def project(column_keys, row_keys = nil)
    multi_cols = column_keys.is_a?(Array)
    multi_rows = row_keys.is_a?(Array)

    columns = get_col_row([*column_keys])
    raise ArgumentError, "at least one column dimension is required" unless columns

    rows = get_col_row([*row_keys])

    values =
      if rows
        rows.map { |row| columns.map { |column| get_value(column.merge(row)) } }
      else
        [columns.map { |column| get_value(column) }]
      end

    OpenStruct.new(
      :columns => labels(columns, multi_cols),
      :rows => rows ? labels(rows, multi_rows) : [],
      :values => values
    )
  end

  # Restricts every later projection to entries matching the given values.
  #
  # NOTE: the filter is merged into this blob itself (the receiver is
  # returned, not a copy), so it stays in effect for all subsequent calls.
  #
  # @param filter [Hash] dimension name => required value
  # @return [Blob] self, to allow chaining
  # @raise [ArgumentError] if a dimension name is unknown
  def filter(filter)
    unknown = filter.keys - @key_names
    unless unknown.empty?
      raise ArgumentError, "unknown dimension(s) #{unknown.inspect} (known: #{@key_names.inspect})"
    end

    @filters.merge!(filter)
    self
  end

  # Every combination of the known values of the given dimensions, with the
  # dimensions in the order they were requested:
  #
  #   [:month, :calendar] => [{:month => "2013-11", :calendar => "agenda1"}, ...]
  #
  # @param index_keys [Array<Symbol>] dimension names (nil entries are ignored)
  # @return [Array<Hash>, nil] nil when no dimension name is given
  # @raise [ArgumentError] if a dimension name is unknown
  def get_col_row(index_keys)
    requested = index_keys.compact
    return nil if requested.empty?

    # Build from the requested names (not from @indexes) so that the caller's
    # ordering drives both the combination order and the label tuples.
    indexes = {}
    requested.each { |key_name| indexes[key_name] = index_for(key_name).keys }
    product_hash(indexes)
  end

  # Cartesian product of the value lists, one hash per combination:
  #
  #   {:month => ["2013-11", "2013-12"], :calendar => ["agenda1", "agenda2"]} =>
  #   [
  #     {:month => "2013-11", :calendar => "agenda1"}, ...
  #   ]
  #
  # @param hsh [Hash{Object => Array}] name => possible values
  # @return [Array<Hash>] an empty +hsh+ yields the single empty combination
  def product_hash(hsh)
    return [{}] if hsh.empty?

    names = hsh.keys
    first, *rest = hsh.values
    first.product(*rest).map { |combination| Hash[names.zip(combination)] }
  end

  # Combined value of all entries matching +filters+ and the blob's filters.
  #
  # @param filters [Hash] dimension name => required value
  # @return [Object, nil] the values reduced with +, nil when nothing matches
  def get_value(filters)
    query_values(filters).map(&:last).reduce(&:+)
  end

  # Entries matching +filters+ AND the filters installed through #filter.
  #
  # @param filters [Hash] dimension name => required value
  # @return [Array<Array>] matching [keys, value] pairs
  # @raise [ArgumentError] if a dimension name is unknown
  def query_values(filters)
    # Kept as a list of pairs rather than merged hashes: when the same
    # dimension is constrained by both the cell and the blob filter, both
    # constraints must hold (conflicting values match nothing).
    constraints = filters.to_a + @filters.to_a
    return @data.dup if constraints.empty?

    matches = constraints.map do |key_name, key_value|
      # A value that was never pushed simply matches no entry.
      index_for(key_name)[key_value] || Set.new
    end

    matches.reduce(:&).map { |position| @data[position] }
  end

  private

  # Turns combination hashes into labels: tuples when the caller asked for
  # several dimensions (an Array), bare values otherwise.
  def labels(combinations, multi)
    combinations.map { |combination| multi ? combination.values : combination.values.first }
  end

  # The index of a dimension, failing loudly on names the blob does not know.
  def index_for(key_name)
    @indexes.fetch(key_name) do
      raise ArgumentError, "unknown dimension #{key_name.inspect} (known: #{@key_names.inspect})"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment