Here is an `@with` macro that evaluates an expression against the columns of a DataFrame (or even the entries of a Dict), where columns are referenced by quoted symbols such as `:a`.
"""
    replace_syms(x, membernames)

Return a copy of the expression tree `x` in which every quoted symbol (such as
`:a`) is replaced by a generated variable name. Each substitution is recorded in
`membernames` as `quoted name => generated name`; a name that is already
recorded reuses its existing generated name. Values that are neither an
expression nor a quoted symbol are returned unchanged.

The quoted field name in a field access such as `obj.field` is left untouched,
because it names a field rather than a member of the container.
"""
replace_syms(x, membernames) = x

# Current Julia parses a quoted symbol inside an expression as a `QuoteNode`,
# while `Meta.quot` and older parsers produce `Expr(:quote, ...)`; accept both.
replace_syms(q::QuoteNode, membernames) = _member_alias!(membernames, q.value)

function replace_syms(e::Expr, membernames)
    if e.head == :quote
        return _member_alias!(membernames, e.args[1])
    elseif e.head == :. && length(e.args) == 2 && _is_quoted(e.args[2])
        # Field access: only the object expression may contain member references.
        return Expr(:., replace_syms(e.args[1], membernames), e.args[2])
    end
    return Expr(e.head, Any[replace_syms(arg, membernames) for arg in e.args]...)
end

# Return the generated name recorded for `key`, creating and recording a fresh
# one the first time `key` is seen.
function _member_alias!(membernames, key)
    if !haskey(membernames, key)
        membernames[key] = gensym()
    end
    return membernames[key]
end

# True when `x` is a quoted value in either of its two representations.
_is_quoted(x) = false
_is_quoted(x::QuoteNode) = true
_is_quoted(x::Expr) = x.head == :quote
"""
    with_helper(d, body)

Build the expression that `@with` expands to. Quoted symbols in `body` are
replaced by generated argument names (via `replace_syms`), the rewritten body
is wrapped in a function with a generated name, and that function is called
with one `getindex(d, :name)` lookup per distinct quoted symbol.

`d` is the expression naming the container; `body` is the expression to
evaluate. Returns an `Expr` holding the function definition followed by the
call.
"""
function with_helper(d, body)
    aliases = Dict{Symbol, Symbol}()
    rewritten = replace_syms(body, aliases)
    # Fix one ordering of the referenced names so that each function parameter
    # lines up with the lookup that supplies its value.
    columns = collect(keys(aliases))
    params = [aliases[col] for col in columns]
    lookups = [:(getindex($d, $(Meta.quot(col)))) for col in columns]
    kernel = gensym()
    return quote
        function $kernel($(params...))
            $rewritten
        end
        $kernel($(lookups...))
    end
end
"""
    @with d body

Evaluate `body` with every quoted symbol (such as `:a`) standing for
`getindex(d, :a)`. The expansion is produced by `with_helper`: the body runs
inside a generated function whose arguments are the looked-up members of `d`.
"""
macro with(d, body)
    # Escape the expansion so that `d` and the free variables of `body` resolve
    # in the caller's scope (for example a function argument or local variable)
    # instead of as globals of the module that defines this macro. The generated
    # function and argument names come from `gensym`, so they cannot collide
    # with caller names.
    return esc(with_helper(d, body))
end
Here are simple examples with a Dict:
# Example container: a Dict keyed by Symbol. The explicit constructor replaces the
# `{...}` literal, which current Julia no longer accepts; `Dict{Any,Any}` keeps
# the untyped key and value types that the literal produced.
d = Dict{Any,Any}(:s => 3, :y => 44, :d => 5)

# Parenthesized call form: d[:s] + d[:y] == 3 + 44 == 47.
x = @with(d, :s + :y)

# Variables other than quoted symbols (here `t`) are used as-is: 3 + 44 + 3 == 50.
t = 3
x = @with d :s + :y + t

# A multi-statement body; the value of the last expression is the result:
# z == 3 + 44 - 1 == 46, so x == 3 + 46 == 49.
x = @with d begin
    z = t + :y - 1
    :s + z
end
Here is the DataFrames example that @johnmyleswhite put together. I added `dot9` as an example that uses `@with`. `dot9` runs at comparable speeds to `dot8`.
dot1 => 0.01229682
dot2 => 0.388635147
dot3 => 1.89651897
dot4 => 0.355990775
dot5 => 0.013920764
dot6 => 0.015714847
dot7 => 0.065454129
dot8 => 0.013927748
dot9 => 0.013752586
using DataArrays, DataFrames
# Benchmark inputs. The statements are order-dependent: the generator is seeded
# first, then `a` and `b` are drawn from it in that order.
# NOTE(review): `srand` and the `{a, b}` literal below are early-Julia spellings
# that Julia 1.x does not accept — confirm the Julia version this script targets.
srand(1)
# Number of elements in each benchmark vector.
const n = 5_000_000
# Two plain vectors of `n` random values each.
a = rand(n)
b = rand(n)
# `data` presumably wraps each vector in a DataArray (from DataArrays) — TODO confirm.
da = data(a)
db = data(b)
# DataFrame with columns `a` and `b` taken from the wrapped vectors.
df = DataFrame(a = da, b = db)
# Second DataFrame built from the plain vectors; `names!` presumably renames its
# columns to :a and :b — TODO confirm. `df2` is not referenced by the visible
# benchmark functions.
df2 = DataFrame({a, b})
names!(df2, [:a, :b])
# Return the `data` field of a DataArray, giving callers direct access to the
# stored values.
# NOTE(review): this adds a method to `Base.values` for a type this file does not
# define (presumably from DataArrays) — confirm that extending it here is intended.
function Base.values(da::DataArray)
    return da.data
end
"""
    dot1(a::Vector, b::Vector)

Baseline dot product over two plain vectors: return the sum of `a[i] * b[i]`
for `i` from 1 to `length(a)`, accumulated starting from `0.0`. Only the length
of `a` bounds the loop, so `b` must have at least as many elements.
"""
function dot1(a::Vector, b::Vector)
    acc = 0.0
    last = length(a)
    for i = 1:last
        acc = acc + a[i] * b[i]
    end
    return acc
end
"""
    dot2(da::DataVector, db::DataVector)

Dot product that indexes the data vectors directly (`da[i]`, `db[i]`) and
type-asserts every element to `eltype(da)` before multiplying. The loop runs
from 1 to `length(da)` and accumulates starting from `0.0`.
"""
function dot2(da::DataVector, db::DataVector)
    T = eltype(da)
    acc = 0.0
    for i = 1:length(da)
        # Both operands are asserted against the element type of `da`.
        lhs = da[i]::T
        rhs = db[i]::T
        acc += lhs * rhs
    end
    return acc
end
"""
    dot3(df::DataFrame)

Dot product of columns `:a` and `:b` of `df`, with the columns extracted and
the loop run inside this same function. Each element is type-asserted to the
element type of column `:a`; accumulation starts from `0.0`.
"""
function dot3(df::DataFrame)
    da = df[:a]
    db = df[:b]
    T = eltype(da)
    total = 0.0
    for i = 1:length(da)
        total = total + da[i]::T * db[i]::T
    end
    return total
end
"""
    dot4(df::DataFrame)

Dot product of columns `:a` and `:b` of `df`, computed by extracting the two
columns and handing them to `dot2`, which runs the loop.
"""
function dot4(df::DataFrame)
    return dot2(df[:a], df[:b])
end
"""
    dot5(da::DataVector, db::DataVector)

Dot product that reads the `data` field of each vector directly
(`da.data[i] * db.data[i]`) for `i` from 1 to `length(da)`, accumulating from
`0.0`.
"""
function dot5(da::DataVector, db::DataVector)
    acc = 0.0
    for i = 1:length(da)
        acc = acc + da.data[i] * db.data[i]
    end
    return acc
end
"""
    dot6(da::DataVector, db::DataVector)

Dot product that obtains each operand through `values(...)` on every iteration
(`values(da)[i] * values(db)[i]`) for `i` from 1 to `length(da)`, accumulating
from `0.0`. This file extends `values` for `DataArray` to return the `data`
field.
"""
function dot6(da::DataVector, db::DataVector)
    acc = 0.0
    for i = 1:length(da)
        acc = acc + values(da)[i] * values(db)[i]
    end
    return acc
end
"""
    dot7(da::DataVector, db::DataVector)

Dot product that skips every index at which `isna` reports a missing entry in
either vector. The remaining products `values(da)[i] * values(db)[i]` are
accumulated starting from `0.0`, for `i` from 1 to `length(da)`.
"""
function dot7(da::DataVector, db::DataVector)
    x = 0.0
    for i in 1:length(da)
        # Check both vectors: a missing entry on either side excludes the pair.
        if !(isna(da, i) || isna(db, i))
            x += values(da)[i] * values(db)[i]
        end
    end
    return x
end
"""
    dot8(a::Vector, b::Vector)

Dot product over two plain vectors that skips every index at which either
`a[i]` or `b[i]` is `NaN`. The remaining products are accumulated starting from
`0.0`, for `i` from 1 to `length(a)`; `b` must have at least as many elements
as `a`.
"""
function dot8(a::Vector, b::Vector)
    x = 0.0
    for i in 1:length(a)
        # Check both operands: a NaN on either side excludes the pair.
        if !(isnan(a[i]) || isnan(b[i]))
            x += a[i] * b[i]
        end
    end
    return x
end
"""
    dot9(df::DataFrame)

Dot product of columns `:a` and `:b` of `df`, written with `@with` so that
`:a` and `:b` inside the block stand for the corresponding members of `df`.
Operands are read through `values(...)[i]` and accumulated starting from `0.0`.
"""
function dot9(df::DataFrame)
    result = @with df begin
        acc = 0.0
        for i = 1:length(:a)
            acc = acc + values(:a)[i] * values(:b)[i]
        end
        acc
    end
    return result
end
# Time one call of each variant; `@elapsed` yields the elapsed time in seconds.
# NOTE(review): every function is called exactly once here, so each figure
# presumably includes that function's first-call compilation — confirm whether a
# warm-up call was intended before timing.
# Plain vectors.
t1 = @elapsed dot1(a, b)
# Data vectors, indexed directly with type assertions.
t2 = @elapsed dot2(da, db)
# DataFrame, columns extracted and looped over in the same function.
t3 = @elapsed dot3(df)
# DataFrame, columns extracted and passed on to dot2.
t4 = @elapsed dot4(df)
# Data vectors, reading the `data` field directly.
t5 = @elapsed dot5(da, db)
# Data vectors, reading through `values(...)`.
t6 = @elapsed dot6(da, db)
# Data vectors, with an `isna` check per element.
t7 = @elapsed dot7(da, db)
# Plain vectors, with an `isnan` check per element.
t8 = @elapsed dot8(a, b)
# DataFrame, via the `@with` macro.
t9 = @elapsed dot9(df)