Skip to content

Instantly share code, notes, and snippets.

@kleinschmidt
Last active October 22, 2016 03:02
Show Gist options
  • Save kleinschmidt/0125c14dd6d5b285e7bb82031e602ea1 to your computer and use it in GitHub Desktop.
Save kleinschmidt/0125c14dd6d5b285e7bb82031e602ea1 to your computer and use it in GitHub Desktop.
benchmarking DataFrames#1081
- # Formulas for representing and working with linear-model-type expressions
- # Original by Harlan D. Harris. Later modifications by John Myles White
- # and Douglas M. Bates.
-
- ## Formulas are written as expressions and parsed by the Julia parser.
- ## For example :(y ~ a + b + log(c))
- ## In Julia the & operator is used for an interaction. What would be written
- ## in R as y ~ a + b + a:b is written :(y ~ a + b + a&b) in Julia.
- ## The equivalent R expression, y ~ a*b, is the same in Julia
-
- ## The lhs of a one-sided formula is 'nothing'
- ## The rhs of a formula can be 1
-
- type Formula
- lhs::@compat(Union{Symbol, Expr, Void})
- rhs::@compat(Union{Symbol, Expr, Integer})
- end
-
- macro ~(lhs, rhs)
- ex = Expr(:call,
- :Formula,
- Base.Meta.quot(lhs),
- Base.Meta.quot(rhs))
- return ex
- end
-
- #
192000 # TODO: implement contrast types in Formula/Terms
- #
- type Terms
- terms::Vector
- eterms::Vector # evaluation terms
- factors::Matrix{Bool} # maps terms to evaluation terms
- ## An eterms x terms matrix which is true for terms that need to be "promoted"
- ## to full rank in constructing a model matrx
- is_non_redundant::Matrix{Bool}
- # order can probably be dropped. It is vec(sum(factors, 1))
- order::Vector{Int} # orders of rhs terms
- response::Bool # indicator of a response, which is eterms[1] if present
- intercept::Bool # is there an intercept column in the model matrix?
- end
-
- type ModelFrame
- df::AbstractDataFrame
- terms::Terms
- msng::BitArray
- ## mapping from df keys to contrasts matrices
- contrasts::Dict{Symbol, ContrastsMatrix}
- end
-
- typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}
-
- type ModelMatrix{T <: AbstractFloatMatrix}
- m::T
- assign::Vector{Int}
- end
-
- Base.size(mm::ModelMatrix) = size(mm.m)
- Base.size(mm::ModelMatrix, dim...) = size(mm.m, dim...)
-
- function Base.show(io::IO, f::Formula)
- print(io,
- string("Formula: ",
- f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs))
576000 end
-
- ## Return, as a vector of symbols, the names of all the variables in
- ## an expression or a formula
- function allvars(ex::Expr)
- if ex.head != :call error("Non-call expression encountered") end
- cc=Symbol[]
- for i in ex.args[2:end] cc=append!(cc,allvars(i)) end
- cc
- end
- allvars(f::Formula) = unique(vcat(allvars(f.rhs), allvars(f.lhs)))
- allvars(sym::Symbol) = [sym]
- allvars(v::Any) = Array(Symbol, 0)
-
- # special operators in formulas
- const specials = Set([:+, :-, :*, :/, :&, :|, :^])
448000
- function dospecials(ex::Expr)
0 if ex.head != :call error("Non-call expression encountered") end
0 a1 = ex.args[1]
0 if !(a1 in specials) return ex end
0 excp = copy(ex)
0 excp.args = vcat(a1,map(dospecials, ex.args[2:end]))
0 if a1 != :* return excp end
0 aa = excp.args
0 a2 = aa[2]
0 a3 = aa[3]
0 if length(aa) > 3
0 excp.args = vcat(a1, aa[3:end])
0 a3 = dospecials(excp)
- end
- ## this order of expansion gives the R-style ordering of interaction
- ## terms (after sorting in increasing interaction order) for higher-
- ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 +
0 ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3)
0 :($a2 + $a2 & $a3 + $a3)
- end
- dospecials(a::Any) = a
-
- ## Distribution of & over +
- const distributive = @compat Dict(:& => :+)
-
- distribute(ex::Expr) = distribute!(copy(ex))
- distribute(a::Any) = a
- ## apply distributive property in-place
- function distribute!(ex::Expr)
0 if ex.head != :call error("Non-call expression encountered") end
0 [distribute!(a) for a in ex.args[2:end]]
- ## check that top-level can be distributed
0 a1 = ex.args[1]
0 if a1 in keys(distributive)
-
- ## which op is being DISTRIBUTED (e.g. &, *)?
0 distributed_op = a1
- ## which op is doing the distributing (e.g. +)?
0 distributing_op = distributive[a1]
-
- ## detect distributing sub-expression (first arg is, e.g. :+)
0 is_distributing_subex(e) =
- typeof(e)==Expr && e.head == :call && e.args[1] == distributing_op
-
- ## find first distributing subex
0 first_distributing_subex = findfirst(is_distributing_subex, ex.args)
0 if first_distributing_subex != 0
- ## remove distributing subexpression from args
0 subex = splice!(ex.args, first_distributing_subex)
-
0 newargs = Any[distributing_op]
- ## generate one new sub-expression, which calls the distributed operation
0 ## (e.g. &) on each of the distributing sub-expression's arguments, plus
- ## the non-distributed arguments of the original expression.
0 for a in subex.args[2:end]
0 new_subex = copy(ex)
0 push!(new_subex.args, a)
- ## need to recurse here, in case there are any other
- ## distributing operations in the sub expression
0 distribute!(new_subex)
0 push!(newargs, new_subex)
- end
0 ex.args = newargs
- end
- end
0 ex
- end
- distribute!(a::Any) = a
-
- const associative = Set([:+,:*,:&]) # associative special operators
-
- ## If the expression is a call to the function s return its arguments
- ## Otherwise return the expression
- function ex_or_args(ex::Expr,s::Symbol)
- if ex.head != :call error("Non-call expression encountered") end
- if ex.args[1] == s
- ## recurse in case there are more :calls of s below
- return vcat(map(x -> ex_or_args(x, s), ex.args[2:end])...)
- else
- ## not a :call to s, return condensed version of ex
- return condense(ex)
- end
- end
- ex_or_args(a,s::Symbol) = a
-
- ## Condense calls like :(+(a,+(b,c))) to :(+(a,b,c))
- function condense(ex::Expr)
0 if ex.head != :call error("Non-call expression encountered") end
0 a1 = ex.args[1]
0 if !(a1 in associative) return ex end
0 excp = copy(ex)
0 excp.args = vcat(a1, map(x->ex_or_args(x,a1), ex.args[2:end])...)
0 excp
- end
- condense(a::Any) = a
-
- ## always return an ARRAY of terms
- getterms(ex::Expr) = (ex.head == :call && ex.args[1] == :+) ? ex.args[2:end] : Expr[ex]
- getterms(a::Any) = Any[a]
-
- ord(ex::Expr) = (ex.head == :call && ex.args[1] == :&) ? length(ex.args)-1 : 1
- ord(a::Any) = 1
-
- const nonevaluation = Set([:&,:|]) # operators constructed from other evaluations
- ## evaluation terms - the (filtered) arguments for :& and :|, otherwise the term itself
- function evt(ex::Expr)
- if ex.head != :call error("Non-call expression encountered") end
- if !(ex.args[1] in nonevaluation) return ex end
- filter(x->!isa(x,Number), vcat(map(getterms, ex.args[2:end])...))
- end
- evt(a) = Any[a]
-
- function Terms(f::Formula)
- rhs = condense(distribute(dospecials(f.rhs)))
0 tt = unique(getterms(rhs))
0 tt = tt[!(tt .== 1)] # drop any explicit 1's
0 noint = (tt .== 0) | (tt .== -1) # should also handle :(-(expr,1))
0 tt = tt[!noint]
0 oo = Int[ord(t) for t in tt] # orders of interaction terms
0 if !issorted(oo) # sort terms by increasing order
0 pp = sortperm(oo)
0 tt = tt[pp]
0 oo = oo[pp]
- end
0 etrms = map(evt, tt)
0 haslhs = f.lhs != nothing
0 if haslhs
0 unshift!(etrms, Any[f.lhs])
0 unshift!(oo, 1)
- end
0 ev = unique(vcat(etrms...))
0 sets = [Set(x) for x in etrms]
0 facs = Bool[t in s for t in ev, s in sets]
0 non_redundants = fill(false, size(facs)) # initialize to false
0 Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint))
- end
-
- ## Default NULL handler. Others can be added as keyword arguments
- function null_omit(df::DataFrame)
- cc = complete_cases(df)
0 sub(df, cc), cc
- end
-
- _droplevels!(x::Any) = x
- _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
-
- is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
- is_categorical(::Any) = false
-
- ## Check for non-redundancy of columns. For instance, if x is a factor with two
- ## levels, it should be expanded into two columns in y~0+x but only one column
- ## in y~1+x. The default is the rank-reduced form (contrasts for n levels only
- ## produce n-1 columns). In general, an evaluation term x within a term
- ## x&y&... needs to be "promoted" to full rank if y&... hasn't already been
- ## included (either explicitly in the Terms or implicitly by promoting another
- ## term like z in z&y&...).
- ##
- ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
- ## redundant evaluation terms.
- function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
-
- (n_eterms, n_terms) = size(trms.factors)
-
0 encountered_columns = Vector{eltype(trms.factors)}[]
-
0 if trms.intercept
0 push!(encountered_columns, zeros(eltype(trms.factors), n_eterms))
- end
-
0 for i_term in 1:n_terms
0 for i_eterm in 1:n_eterms
- ## only need to check eterms that are included and can be promoted
- ## (e.g., categorical variables that expand to multiple mm columns)
0 if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]])
0 dropped = trms.factors[:,i_term]
0 dropped[i_eterm] = 0
-
0 if findfirst(encountered_columns, dropped) == 0
0 trms.is_non_redundant[i_eterm, i_term] = true
0 push!(encountered_columns, dropped)
- end
-
- end
- end
- ## once we've checked all the eterms in this term, add it to the list
- ## of encountered terms/columns
0 push!(encountered_columns, Compat.view(trms.factors, :, i_term))
- end
-
0 return trms.is_non_redundant
- end
-
-
- const DEFAULT_CONTRASTS = DummyCoding
-
- ## Set up contrasts:
- ## Combine actual DF columns and contrast types if necessary to compute the
- ## actual contrasts matrices, levels, and term names (using DummyCoding
- ## as the default)
- function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
- evaledContrasts = Dict()
0 for (term, col) in eachcol(df)
0 is_categorical(col) || continue
0 evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
- contrasts[term] :
- DEFAULT_CONTRASTS(),
- col)
- end
0 return evaledContrasts
- end
-
- function ModelFrame(trms::Terms, d::AbstractDataFrame;
- contrasts::Dict = Dict())
- df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
0 names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
0 for c in eachcol(df) _droplevels!(c[2]) end
-
0 evaledContrasts = evalcontrasts(df, contrasts)
-
- ## Check for non-redundant terms, modifying terms in place
0 check_non_redundancy!(trms, df)
-
0 ModelFrame(df, trms, msng, evaledContrasts)
- end
-
- ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
- ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
- ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
-
- ## modify contrasts in place
- function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
- new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col]))
- for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
-
- mf.contrasts = merge(mf.contrasts, new_contrasts)
- return mf
- end
- setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))
-
- """
- StatsBase.model_response(mf::ModelFrame)
- Extract the response column, if present. `DataVector` or
- `PooledDataVector` columns are converted to `Array`s
- """
- function StatsBase.model_response(mf::ModelFrame)
- if mf.terms.response
- convert(Array, mf.df[mf.terms.eterms[1]])
- else
- error("Model formula one-sided")
- end
- end
-
- function nc(name::Symbol, mf::ModelFrame; non_redundant::Bool=false)
- if haskey(mf.contrasts, name)
0 size(non_redundant ?
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
- mf.contrasts[name],
- 2)
- else
0 1
- end
- end
992000
- ## construct model matrix columns from model frame + name (checks for contrasts)
- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
- if haskey(mf.contrasts, name)
0 modelmat_cols(T, mf.df[name],
- non_redundant ?
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
- mf.contrasts[name])
- else
0 modelmat_cols(T, mf.df[name])
- end
- end
-
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
- convert(T, reshape(v, length(v), 1))
- # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
- convert(T, Matrix(reshape(v, length(v), 1)))
-
- """
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
-
16000 Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
- levels align properly.
- """
- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
- v::Union{CategoricalVector, NullableCategoricalVector},
- contrast::ContrastsMatrix)
- ## make sure the levels of the contrast matrix and the categorical data
- ## are the same by constructing a re-indexing vector. Indexing into
- ## reindex with v.refs will give the corresponding row number of the
- ## contrast matrix
- reindex = [findfirst(contrast.levels, l) for l in levels(v)]
0 contrastmatrix = convert(T, contrast.matrix)
0 return indexrows(contrastmatrix, reindex[v.refs])
- end
-
- indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]'
- indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :]
-
- """
- expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
- Create pairwise products of columns from a vector of matrices
- """
- function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
- if length(trm) == 1
0 trm[1]
- else
0 a = trm[1]
0 b = expandcols(trm[2 : end])
0 reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
0 end
0 end
-
- """
- droprandomeffects(trms::Terms)
- Expressions of the form `(a|b)` are "random-effects" terms and are not
- incorporated in the ModelMatrix. This function checks for such terms and,
- if any are present, drops them from the `Terms` object.
- """
- function droprandomeffects(trms::Terms)
- retrms = Bool[Meta.isexpr(t, :call) && t.args[1] == :| for t in trms.terms]
128000 if !any(retrms) # return trms unchanged
0 trms
0 elseif all(retrms) && !trms.response # return an empty Terms object
0 Terms(Any[],Any[],Array(Bool, (0,0)),Array(Bool, (0,0)), Int[], false, trms.intercept)
- else
- # the rows of `trms.factors` correspond to `eterms`, the columns to `terms`
- # After dropping random-effects terms we drop any eterms whose rows are all false
0 ckeep = !retrms # columns to retain
0 facs = trms.factors[:, ckeep]
0 rkeep = vec(sum(facs, 2) .> 0)
0 Terms(trms.terms[ckeep], trms.eterms[rkeep], facs[rkeep, :],
- trms.is_non_redundant[rkeep, ckeep],
- trms.order[ckeep], trms.response, trms.intercept)
- end
- end
-
- """
- dropresponse!(trms::Terms)
- Drop the response term, `trms.eterms[1]` and the first row and column
- of `trms.factors` if `trms.response` is true.
- """
- function dropresponse!(trms::Terms)
- if trms.response
0 ckeep = 2:size(trms.factors, 2)
0 rkeep = vec(any(trms.factors[:, ckeep], 2))
0 Terms(trms.terms, trms.eterms[rkeep], trms.factors[rkeep, ckeep],
- trms.is_non_redundant[rkeep, ckeep], trms.order[ckeep], false, trms.intercept)
- else
0 trms
- end
- end
-
- """
- ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
- Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
- `terms` and `df` members of `mf`.
-
- This is basically a map-reduce where terms are mapped to columns by `cols`
- and reduced by `hcat`. During the collection of the columns the `assign`
- vector is created. `assign` maps columns of the model matrix to terms in
- the model frame. It can also be considered as mapping coefficients to
- terms and, hence, to names.
-
- If there is an intercept in the model, that column occurs first and its
- `assign` value is zero.
-
- Mixed-effects models include "random-effects" terms which are ignored when
- creating the model matrix.
- """
- @compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
- dfrm = mf.df
0 terms = droprandomeffects(dropresponse!(mf.terms))
0 factors = terms.factors
-
- ## Step 1:
- ## compute which columns correspond to which terms in order to pre-allocate
- ## model matrix and set up views later
80000 term_col_inds = []
0 num_cols = convert(Int, terms.intercept)
-
240000 for (i_term, term) in enumerate(terms.terms)
0 f = view(factors, :, i_term)
0 any(f) || continue
-
480000 block_sizes = map((et,nr) -> nc(et, mf, non_redundant=nr),
- view(terms.eterms, f),
- view(terms.is_non_redundant, f, i_term))
0 num_term_cols = prod(block_sizes)
-
176000 push!(term_col_inds, (1:num_term_cols)+num_cols)
0 num_cols += num_term_cols
- end
-
0 if num_cols == 0
0 error("Could not construct model matrix. Resulting matrix has 0 columns.")
0 end
-
- # Pre-allocate model matrix
24000128000 mm = similar(convert(T, Matrix()), size(dfrm, 1), num_cols)
112000 assign = zeros(Int, num_cols)
-
0 if terms.intercept
0 mm[:,1] = 1
- end
-
- ## Step 2:
- ## Fill in the columns for each term.
-
- ## Map eval. term name + redundancy bool to cached model matrix columns
- ## (views of model matrix itself for eval terms that correspond directly to
- ## mm columns, and arrays otherwise)
0 eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
-
- ## turn each term into a vector of mm columns for its eval. terms, using
- ## "promoted" full-rank versions of categorical columns for non-redundant
- ## eval. terms:
304000 for (i_term, term) in enumerate(terms.terms)
- ## Pull out the eval terms, and the non-redundancy flags for this term
0 ff = view(factors, :, i_term)
384000 eterms = view(terms.eterms, ff)
64000 non_redundants = view(terms.is_non_redundant, ff, i_term)
192000 if length(eterms) == 1 && term == eterms[1]
- ## handle special case where term == eterm (e.g., main effect
- ## terms). in this case, we can store the columns generated for the
- ## eterm directly in the model matrix and just hold onto a view of
- ## those columns for any higher-order terms that need them
0 et = eterms[1]
0 nr = non_redundants[1]
0 mm[:, term_col_inds[i_term]] = modelmat_cols(T, et, mf, non_redundant=nr)
16000384000 eterm_cols[(et, nr)] = view(mm, :, term_col_inds[i_term])
- else
- ## general case with multiple eterms and expandcols
0 blocks = T[]
0 for (et, nr) in zip(eterms, non_redundants)
0 block = get!(eterm_cols, (et, nr)) do
- modelmat_cols(T, et, mf, non_redundant=nr)
- end
0 push!(blocks, block)
- end
0 mm[:, term_col_inds[i_term]] = expandcols(blocks)
- end
0 assign[term_col_inds[i_term]] = i_term
- end
-
32000 ModelMatrix{T}(mm, assign)
- end
- ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)
-
-
- """
- termnames(term::Symbol, col)
- Returns a vector of strings with the names of the coefficients
- associated with a term. If the column corresponding to the term
- is not a `CategoricalArray` or `NullableCategoricalArray`,
- a one-element vector is returned.
- """
- termnames(term::Symbol, col) = [string(term)]
- function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false)
- if haskey(mf.contrasts, term)
- termnames(term, mf.df[term],
- non_redundant ?
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) :
- mf.contrasts[term])
- else
- termnames(term, mf.df[term])
- end
- end
-
- termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) =
- ["$term: $name" for name in contrast.termnames]
-
-
- function expandtermnames(term::Vector)
- if length(term) == 1
- return term[1]
- else
- return foldr((a,b) -> vec([string(lev1, " & ", lev2) for
- lev1 in a,
- lev2 in b]),
- term)
- end
- end
-
-
- """
- coefnames(mf::ModelFrame)
- Returns a vector of coefficient names constructed from the Terms
- member and the types of the evaluation columns.
- """
- function coefnames(mf::ModelFrame)
- terms = droprandomeffects(dropresponse!(mf.terms))
-
- ## strategy mirrors ModelMatrx constructor:
- eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}()
- term_names = Vector{Compat.UTF8String}[]
-
- if terms.intercept
- push!(term_names, Compat.UTF8String["(Intercept)"])
- end
-
- factors = terms.factors
-
- for (i_term, term) in enumerate(terms.terms)
-
- ## names for columns for eval terms
- names = Vector{Compat.UTF8String}[]
-
- ff = Compat.view(factors, :, i_term)
- eterms = Compat.view(terms.eterms, ff)
- non_redundants = Compat.view(terms.is_non_redundant, ff, i_term)
-
- for (et, nr) in zip(eterms, non_redundants)
- if !haskey(eterm_names, (et, nr))
- eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr)
- end
- push!(names, eterm_names[(et, nr)])
- end
- push!(term_names, expandtermnames(names))
- end
-
- reduce(vcat, Vector{Compat.UTF8String}(), term_names)
- end
-
- function Formula(t::Terms)
- lhs = t.response ? t.eterms[1] : nothing
- rhs = Expr(:call,:+)
- if t.intercept
- push!(rhs.args,1)
- end
- append!(rhs.args,t.terms)
- Formula(lhs,rhs)
- end
-
using DataFrames
using BenchmarkTools
using JLD
srand(1)
n = 1_000_000
d = DataFrame(x = rand(n), y = rand(n))
suite = BenchmarkGroup()
suite["ModelFrame"] = BenchmarkGroup()
suite["ModelMatrix"] = BenchmarkGroup()
for rhs in [ :(x), :(x+y), :(x*y) ]
f = Formula(nothing, rhs)
suite["ModelFrame"][rhs] = @benchmarkable ModelFrame($f, $d)
mf = ModelFrame(f, d)
suite["ModelMatrix"][rhs] = @benchmarkable ModelMatrix($mf)
end
# run these lines the first time to tune the benchmarking parameters:
# tune!(suite)
# JLD.save(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite", params(suite))
# then load them for future runs:
loadparams!(suite,
JLD.load(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite"),
:evals, :samples)
results = run(suite, verbose=true)
# run a fixed number of time for memory allocation profiling
using DataFrames
srand(1)
n = 1_000_000
d = DataFrame(x = rand(n), y = rand(n))
f = Formula(nothing, :(x+y))
mf = ModelFrame(f, d)
mm = ModelMatrix(mf)
Profile.init(delay=0.01)
Profile.clear()
Profile.clear_malloc_data()
@profile @time for _ in 1:1000
ModelMatrix(mf)
end
# Write profile results to profile.bin.
open("profile.bin", "w") do f serialize(f, Profile.retrieve()) end
julia> j = judge(median(results), median(results_master))
julia> j["ModelMatrix"]
3-element BenchmarkTools.BenchmarkGroup:
tags: []
:(x + y) => TrialJudgement(+21.64% => regression)
:(x * y) => TrialJudgement(+19.32% => regression)
:x => TrialJudgement(-14.57% => improvement)
julia> j["ModelFrame"]
3-element BenchmarkTools.BenchmarkGroup:
tags: []
:(x + y) => TrialJudgement(+0.10% => invariant)
:(x * y) => TrialJudgement(-0.88% => invariant)
:x => TrialJudgement(-1.63% => invariant)
julia> results_master["ModelMatrix"][:x]
BenchmarkTools.Trial:
samples: 477
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 22.89 mb
allocs estimate: 79
minimum time: 8.11 ms (11.03% GC)
median time: 10.20 ms (31.64% GC)
mean time: 10.49 ms (31.88% GC)
maximum time: 17.91 ms (51.26% GC)
julia> results["ModelMatrix"][:x]
BenchmarkTools.Trial:
samples: 557
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 22.89 mb
allocs estimate: 89
minimum time: 8.00 ms (12.85% GC)
median time: 8.71 ms (31.05% GC)
mean time: 8.98 ms (31.10% GC)
maximum time: 13.94 ms (35.56% GC)
julia> results_master["ModelMatrix"][:(x+y)]
BenchmarkTools.Trial:
samples: 356
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 30.52 mb
allocs estimate: 128
minimum time: 12.37 ms (26.39% GC)
median time: 13.12 ms (25.51% GC)
mean time: 14.05 ms (24.98% GC)
maximum time: 224.42 ms (1.35% GC)
julia> results["ModelMatrix"][:(x+y)]
BenchmarkTools.Trial:
samples: 306
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 38.15 mb
allocs estimate: 148
minimum time: 12.63 ms (7.10% GC)
median time: 15.95 ms (25.34% GC)
mean time: 16.34 ms (25.49% GC)
maximum time: 23.80 ms (17.63% GC)
julia> results_master["ModelMatrix"][:(x*y)]
BenchmarkTools.Trial:
samples: 224
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 45.79 mb
allocs estimate: 233
minimum time: 18.26 ms (24.74% GC)
median time: 19.28 ms (24.32% GC)
mean time: 22.35 ms (23.24% GC)
maximum time: 549.82 ms (2.38% GC)
julia> results["ModelMatrix"][:(x*y)]
BenchmarkTools.Trial:
samples: 213
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 53.42 mb
allocs estimate: 286
minimum time: 21.63 ms (25.27% GC)
median time: 23.01 ms (25.53% GC)
mean time: 23.52 ms (25.68% GC)
maximum time: 33.33 ms (5.18% GC)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment