Created
December 13, 2013 17:01
-
-
Save mru2/7947486 to your computer and use it in GitHub Desktop.
Grouping and 2D projection for multi-dimensional analytics data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'ostruct'
require 'set'

require 'rspec'
#######
# SPECS
#######
describe "blob projection" do
  # Build a fresh Blob for every example so that an example which changes the
  # blob's state (for instance through filtering) cannot influence the others,
  # whatever order the examples run in.
  before(:each) do
    @blob = Blob.new(:month, :calendar, :state)

    # Each record is [month, calendar, state, value].
    records = [
      ["2013-11", "agenda1", "venu", 5],
      ["2013-11", "agenda1", "pas venu", 2],
      ["2013-11", "agenda2", "venu", 6],
      ["2013-11", "agenda2", "pas venu", 0],
      ["2013-12", "agenda1", "venu", 7],
      ["2013-12", "agenda1", "pas venu", 1],
      ["2013-12", "agenda2", "venu", 9],
      ["2013-12", "agenda2", "pas venu", 3]
    ]

    # The leading entries of a record form the key, the last one is the value.
    records.each do |*keys, value|
      @blob.push keys, value
    end
  end

  it "should project to 1 dimension" do
    projection = @blob.project(:month)

    projection.columns.should == ["2013-11", "2013-12"]
    projection.values.should == [[13, 20]]
  end

  it "should project to 2 dimensions" do
    projection = @blob.project(:calendar, :month)

    projection.columns.should == ["agenda1", "agenda2"]
    projection.rows.should == ["2013-11", "2013-12"]
    projection.values.should == [[7, 6], [8, 12]]
  end

  it "should handle cross-arrays" do
    projection = @blob.project(:state, [:month, :calendar])

    projection.columns.should == ["venu", "pas venu"]
    projection.rows.should == [["2013-11", "agenda1"], ["2013-11", "agenda2"], ["2013-12", "agenda1"], ["2013-12", "agenda2"]]
    projection.values.should == [[5, 2], [6, 0], [7, 1], [9, 3]]
  end

  it "should handle filtering" do
    filtered_projection = @blob.filter(:month => "2013-11").project(:state, :calendar)

    filtered_projection.columns.should == ["venu", "pas venu"]
    filtered_projection.rows.should == ["agenda1", "agenda2"]
    filtered_projection.values.should == [[5, 2], [6, 0]]
  end
end
################
# IMPLEMENTATION
################
# Hash mapping a key to the Set of values registered under that key.
class Index < Hash
  # Registers +value+ under +key+, creating the key's Set on first use.
  # Adding the same value twice under one key has no further effect, since
  # the values are held in a Set.
  #
  # Returns the Set stored under +key+.
  def add(key, value)
    (self[key] ||= Set.new).add(value)
  end
end
# Store of values keyed by one entry per dimension. Keeps one Index per
# dimension (key value => positions in the data list) and uses them to group,
# filter and project the values onto one or two axes.
class Blob
  # key_names: the dimension names, in the order the key entries are given
  # to #push.
  def initialize(*key_names)
    @key_names = key_names
    @data = []
    @filters = {}
    @indexes = {}
    @key_names.each { |key_name| @indexes[key_name] = Index.new }
  end

  # Stores +value+ under +keys+ (one entry per dimension, in the order given
  # to the constructor) and records its position in every dimension's index.
  def push(keys, value)
    @data.push [keys, value]
    position = @data.length - 1
    @key_names.each_with_index do |key_name, i|
      @indexes[key_name].add keys[i], position
    end
  end

  # Projects the data onto columns and, optionally, rows.
  #
  # column_keys / row_keys are either a single dimension name or an Array of
  # names. With an Array, each label is the Array of the key values in the
  # requested order; with a single name, each label is the bare key value.
  #
  # Returns an OpenStruct with:
  #   columns - the column labels
  #   rows    - the row labels ([] when no row dimension is given)
  #   values  - one Array per row (a single one without rows) holding, per
  #             column, the values of the matching entries combined with +,
  #             or nil when no entry matches
  #
  # Raises ArgumentError when no column dimension is given or when a
  # dimension name is unknown.
  def project(column_keys, row_keys = nil)
    columns = get_col_row([*column_keys])
    raise ArgumentError, "at least one column dimension is required" if columns.nil?

    rows = get_col_row([*row_keys])
    values =
      if rows
        rows.map { |row| columns.map { |column| get_value(column.merge(row)) } }
      else
        [columns.map { |column| get_value(column) }]
      end

    OpenStruct.new(
      :columns => labels(columns, column_keys.is_a?(Array)),
      :rows => labels(rows || [], row_keys.is_a?(Array)),
      :values => values
    )
  end

  # Returns a copy of this blob restricted to the entries matching +filter+
  # (a Hash of dimension name => key value), on top of the filters already
  # active. The receiver is left untouched, so it can still be projected
  # unfiltered afterwards. The copy shares the receiver's data and indexes.
  #
  # Raises ArgumentError when a dimension name is unknown.
  def filter(filter)
    filter.each_key { |key_name| index_for(key_name) } # fail early on typos
    view = dup
    view.filters = @filters.merge(filter)
    view
  end

  # [:month, :calendar] => [{:month => "2013-11", :calendar => "agenda1"}, ...]
  #
  # Returns nil when no dimension name is given. The dimensions appear in the
  # requested order, both inside each Hash and in the iteration order (the
  # first dimension varies slowest).
  def get_col_row(index_keys)
    requested = index_keys.compact
    return nil if requested.empty?

    pairs = requested.map { |key_name| [key_name, index_for(key_name).keys] }
    product_hash(Hash[pairs])
  end

  # List of row/cols from index names
  # {:month => ["2013-11", "2013-12"], :calendar => ["agenda1", "agenda2"]} =>
  # [
  #   {:month => "2013-11", :calendar => "agenda1" }, ...
  # ]
  def product_hash(hsh)
    names = hsh.keys
    head, *tail = hsh.values
    return [{}] if head.nil? # no dimension: a single, empty combination

    head.product(*tail).map { |combination| Hash[names.zip(combination)] }
  end

  # Combines with + the values of all entries matching +filters+ and the
  # active filters. Returns nil when no entry matches.
  def get_value(filters)
    query_values(filters).map(&:last).reduce(:+)
  end

  # Returns the [keys, value] entries matching every criterion of +filters+
  # AND every active filter. A key value that was never pushed matches
  # nothing; without any criterion all entries are returned.
  def query_values(filters)
    # Keep both sources separate instead of merging the hashes: when the same
    # dimension appears in both with different values, nothing must match.
    criteria = filters.to_a + @filters.to_a
    return @data.dup if criteria.empty?

    ids = criteria
          .map { |key_name, key_value| index_for(key_name)[key_value] || Set.new }
          .reduce(:&)
    ids.map { |i| @data[i] }
  end

  protected

  # Lets #filter install the merged filters on the copy it returns.
  attr_writer :filters

  private

  # Turns the combination hashes into labels: the list of key values when
  # several dimensions were requested (+multi+), the single key value otherwise.
  def labels(combinations, multi)
    if multi
      combinations.map(&:values)
    else
      combinations.map { |combination| combination.values.first }
    end
  end

  # Index of the given dimension; raises ArgumentError for an unknown name.
  def index_for(key_name)
    @indexes.fetch(key_name) do
      raise ArgumentError, "unknown dimension: #{key_name.inspect}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment