kleinschmidt/formula.jl.mem

## formula.jl.mem
        - # Formulas for representing and working with linear-model-type expressions
        - # Original by Harlan D. Harris.  Later modifications by John Myles White
        - # and Douglas M. Bates.
        -
        - ## Formulas are written as expressions and parsed by the Julia parser.
        - ## For example :(y ~ a + b + log(c))
        - ## In Julia the & operator is used for an interaction.  What would be written
        - ## in R as y ~ a + b + a:b is written :(y ~ a + b + a&b) in Julia.
        - ## The equivalent R expression, y ~ a*b, is the same in Julia
        -
        - ## The lhs of a one-sided formula is 'nothing'
        - ## The rhs of a formula can be 1
        -
        - type Formula
        -     lhs::@compat(Union{Symbol, Expr, Void})
        -     rhs::@compat(Union{Symbol, Expr, Integer})
        - end
        -
        - macro ~(lhs, rhs)
        -     ex = Expr(:call,
        -               :Formula,
        -               Base.Meta.quot(lhs),
        -               Base.Meta.quot(rhs))
        -     return ex
        - end
        -
        - #
   192000 # TODO: implement contrast types in Formula/Terms
        - #
        - type Terms
        -     terms::Vector
        -     eterms::Vector        # evaluation terms
        -     factors::Matrix{Bool} # maps terms to evaluation terms
        -     ## An eterms x terms matrix which is true for terms that need to be "promoted"
        -     ## to full rank in constructing a model matrx
        -     is_non_redundant::Matrix{Bool}
        - # order can probably be dropped.  It is vec(sum(factors, 1))
        -     order::Vector{Int}    # orders of rhs terms
        -     response::Bool        # indicator of a response, which is eterms[1] if present
        -     intercept::Bool       # is there an intercept column in the model matrix?
        - end
        -
        - type ModelFrame
        -     df::AbstractDataFrame
        -     terms::Terms
        -     msng::BitArray
        -     ## mapping from df keys to contrasts matrices
        -     contrasts::Dict{Symbol, ContrastsMatrix}
        - end
        -
        - typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}
        -
        - type ModelMatrix{T <: AbstractFloatMatrix}
        -     m::T
        -     assign::Vector{Int}
        - end
        -
        - Base.size(mm::ModelMatrix) = size(mm.m)
        - Base.size(mm::ModelMatrix, dim...) = size(mm.m, dim...)
        -
        - function Base.show(io::IO, f::Formula)
        -     print(io,
        -           string("Formula: ",
        -                  f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs))
   576000 end
        -
        - ## Return, as a vector of symbols, the names of all the variables in
        - ## an expression or a formula
        - function allvars(ex::Expr)
        -     if ex.head != :call error("Non-call expression encountered") end
        -     cc=Symbol[]
        -     for i in ex.args[2:end] cc=append!(cc,allvars(i)) end
        -     cc
        - end
        - allvars(f::Formula) = unique(vcat(allvars(f.rhs), allvars(f.lhs)))
        - allvars(sym::Symbol) = [sym]
        - allvars(v::Any) = Array(Symbol, 0)
        -
        - # special operators in formulas
        - const specials = Set([:+, :-, :*, :/, :&, :|, :^])
   448000
        - function dospecials(ex::Expr)
        0     if ex.head != :call error("Non-call expression encountered") end
        0     a1 = ex.args[1]
        0     if !(a1 in specials) return ex end
        0     excp = copy(ex)
        0     excp.args = vcat(a1,map(dospecials, ex.args[2:end]))
        0     if a1 != :* return excp end
        0     aa = excp.args
        0     a2 = aa[2]
        0     a3 = aa[3]
        0     if length(aa) > 3
        0         excp.args = vcat(a1, aa[3:end])
        0         a3 = dospecials(excp)
        -     end
        -     ## this order of expansion gives the R-style ordering of interaction
        -     ## terms (after sorting in increasing interaction order) for higher-
        -     ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 +
        0     ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3)
        0     :($a2 + $a2 & $a3 + $a3)
        - end
        - dospecials(a::Any) = a
        -
        - ## Distribution of & over +
        - const distributive = @compat Dict(:& => :+)
        -
        - distribute(ex::Expr) = distribute!(copy(ex))
        - distribute(a::Any) = a
        - ## apply distributive property in-place
        - function distribute!(ex::Expr)
        0     if ex.head != :call error("Non-call expression encountered") end
        0     [distribute!(a) for a in ex.args[2:end]]
        -     ## check that top-level can be distributed
        0     a1 = ex.args[1]
        0     if a1 in keys(distributive)
        -
        -         ## which op is being DISTRIBUTED (e.g. &, *)?
        0         distributed_op = a1
        -         ## which op is doing the distributing (e.g. +)?
        0         distributing_op = distributive[a1]
        -
        -         ## detect distributing sub-expression (first arg is, e.g. :+)
        0         is_distributing_subex(e) =
        -             typeof(e)==Expr && e.head == :call && e.args[1] == distributing_op
        -
        -         ## find first distributing subex
        0         first_distributing_subex = findfirst(is_distributing_subex, ex.args)
        0         if first_distributing_subex != 0
        -             ## remove distributing subexpression from args
        0             subex = splice!(ex.args, first_distributing_subex)
        -
        0             newargs = Any[distributing_op]
        -             ## generate one new sub-expression, which calls the distributed operation
        0             ## (e.g. &) on each of the distributing sub-expression's arguments, plus
        -             ## the non-distributed arguments of the original expression.
        0             for a in subex.args[2:end]
        0                 new_subex = copy(ex)
        0                 push!(new_subex.args, a)
        -                 ## need to recurse here, in case there are any other
        -                 ## distributing operations in the sub expression
        0                 distribute!(new_subex)
        0                 push!(newargs, new_subex)
        -             end
        0             ex.args = newargs
        -         end
        -     end
        0     ex
        - end
        - distribute!(a::Any) = a
        -
        - const associative = Set([:+,:*,:&])       # associative special operators
        -
        - ## If the expression is a call to the function s return its arguments
        - ## Otherwise return the expression
        - function ex_or_args(ex::Expr,s::Symbol)
        -     if ex.head != :call error("Non-call expression encountered") end
        -     if ex.args[1] == s
        -         ## recurse in case there are more :calls of s below
        -         return vcat(map(x -> ex_or_args(x, s), ex.args[2:end])...)
        -     else
        -         ## not a :call to s, return condensed version of ex
        -         return condense(ex)
        -     end
        - end
        - ex_or_args(a,s::Symbol) = a
        -
        - ## Condense calls like :(+(a,+(b,c))) to :(+(a,b,c))
        - function condense(ex::Expr)
        0     if ex.head != :call error("Non-call expression encountered") end
        0     a1 = ex.args[1]
        0     if !(a1 in associative) return ex end
        0     excp = copy(ex)
        0     excp.args = vcat(a1, map(x->ex_or_args(x,a1), ex.args[2:end])...)
        0     excp
        - end
        - condense(a::Any) = a
        -
        - ## always return an ARRAY of terms
        - getterms(ex::Expr) = (ex.head == :call && ex.args[1] == :+) ? ex.args[2:end] : Expr[ex]
        - getterms(a::Any) = Any[a]
        -
        - ord(ex::Expr) = (ex.head == :call && ex.args[1] == :&) ? length(ex.args)-1 : 1
        - ord(a::Any) = 1
        -
        - const nonevaluation = Set([:&,:|])        # operators constructed from other evaluations
        - ## evaluation terms - the (filtered) arguments for :& and :|, otherwise the term itself
        - function evt(ex::Expr)
        -     if ex.head != :call error("Non-call expression encountered") end
        -     if !(ex.args[1] in nonevaluation) return ex end
        -     filter(x->!isa(x,Number), vcat(map(getterms, ex.args[2:end])...))
        - end
        - evt(a) = Any[a]
        -
        - function Terms(f::Formula)
        -     rhs = condense(distribute(dospecials(f.rhs)))
        0     tt = unique(getterms(rhs))
        0     tt = tt[!(tt .== 1)]             # drop any explicit 1's
        0     noint = (tt .== 0) | (tt .== -1) # should also handle :(-(expr,1))
        0     tt = tt[!noint]
        0     oo = Int[ord(t) for t in tt]     # orders of interaction terms
        0     if !issorted(oo)                 # sort terms by increasing order
        0         pp = sortperm(oo)
        0         tt = tt[pp]
        0         oo = oo[pp]
        -     end
        0     etrms = map(evt, tt)
        0     haslhs = f.lhs != nothing
        0     if haslhs
        0         unshift!(etrms, Any[f.lhs])
        0         unshift!(oo, 1)
        -     end
        0     ev = unique(vcat(etrms...))
        0     sets = [Set(x) for x in etrms]
        0     facs = Bool[t in s for t in ev, s in sets]
        0     non_redundants = fill(false, size(facs)) # initialize to false
        0     Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint))
        - end
        -
        - ## Default NULL handler.  Others can be added as keyword arguments
        - function null_omit(df::DataFrame)
        -     cc = complete_cases(df)
        0     sub(df, cc), cc
        - end
        -
        - _droplevels!(x::Any) = x
        - _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
        -
        - is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
        - is_categorical(::Any) = false
        -
        - ## Check for non-redundancy of columns.  For instance, if x is a factor with two
        - ## levels, it should be expanded into two columns in y~0+x but only one column
        - ## in y~1+x.  The default is the rank-reduced form (contrasts for n levels only
        - ## produce n-1 columns).  In general, an evaluation term x within a term
        - ## x&y&... needs to be "promoted" to full rank if y&... hasn't already been
        - ## included (either explicitly in the Terms or implicitly by promoting another
        - ## term like z in z&y&...).
        - ##
        - ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
        - ## redundant evaluation terms.
        - function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
        -
        -     (n_eterms, n_terms) = size(trms.factors)
        -
        0     encountered_columns = Vector{eltype(trms.factors)}[]
        -
        0     if trms.intercept
        0         push!(encountered_columns, zeros(eltype(trms.factors), n_eterms))
        -     end
        -
        0     for i_term in 1:n_terms
        0         for i_eterm in 1:n_eterms
        -             ## only need to check eterms that are included and can be promoted
        -             ## (e.g., categorical variables that expand to multiple mm columns)
        0             if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]])
        0                 dropped = trms.factors[:,i_term]
        0                 dropped[i_eterm] = 0
        -
        0                 if findfirst(encountered_columns, dropped) == 0
        0                     trms.is_non_redundant[i_eterm, i_term] = true
        0                     push!(encountered_columns, dropped)
        -                 end
        -
        -             end
        -         end
        -         ## once we've checked all the eterms in this term, add it to the list
        -         ## of encountered terms/columns
        0         push!(encountered_columns, Compat.view(trms.factors, :, i_term))
        -     end
        -
        0     return trms.is_non_redundant
        - end
        -
        -
        - const DEFAULT_CONTRASTS = DummyCoding
        -
        - ## Set up contrasts:
        - ## Combine actual DF columns and contrast types if necessary to compute the
        - ## actual contrasts matrices, levels, and term names (using DummyCoding
        - ## as the default)
        - function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
        -     evaledContrasts = Dict()
        0     for (term, col) in eachcol(df)
        0         is_categorical(col) || continue
        0         evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
        -                                                 contrasts[term] :
        -                                                 DEFAULT_CONTRASTS(),
        -                                                 col)
        -     end
        0     return evaledContrasts
        - end
        -
        - function ModelFrame(trms::Terms, d::AbstractDataFrame;
        -                     contrasts::Dict = Dict())
        -     df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
        0     names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
        0     for c in eachcol(df) _droplevels!(c[2]) end
        -
        0     evaledContrasts = evalcontrasts(df, contrasts)
        -
        -     ## Check for non-redundant terms, modifying terms in place
        0     check_non_redundancy!(trms, df)
        -
        0     ModelFrame(df, trms, msng, evaledContrasts)
        - end
        -
        - ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
        - ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
        - ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
        -
        - ## modify contrasts in place
        - function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
        -     new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col]))
        -                       for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
        -
        -     mf.contrasts = merge(mf.contrasts, new_contrasts)
        -     return mf
        - end
        - setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))
        -
        - """
        -     StatsBase.model_response(mf::ModelFrame)
        - Extract the response column, if present.  `DataVector` or
        - `PooledDataVector` columns are converted to `Array`s
        - """
        - function StatsBase.model_response(mf::ModelFrame)
        -     if mf.terms.response
        -         convert(Array, mf.df[mf.terms.eterms[1]])
        -     else
        -         error("Model formula one-sided")
        -     end
        - end
        -
        - function nc(name::Symbol, mf::ModelFrame; non_redundant::Bool=false)
        -     if haskey(mf.contrasts, name)
        0         size(non_redundant ?
        -              ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
        -              mf.contrasts[name],
        -              2)
        -     else
        0         1
        -     end
        - end
   992000
        - ## construct model matrix columns from model frame + name (checks for contrasts)
        - function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
        -     if haskey(mf.contrasts, name)
        0         modelmat_cols(T, mf.df[name],
        -                       non_redundant ?
        -                       ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
        -                       mf.contrasts[name])
        -     else
        0         modelmat_cols(T, mf.df[name])
        -     end
        - end
        -
        - modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
        -     convert(T, reshape(v, length(v), 1))
        - # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
        - modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
        -     convert(T, Matrix(reshape(v, length(v), 1)))
        -
        - """
        -     modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
        -
    16000 Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
        - levels align properly.
        - """
        - function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
        -                                                v::Union{CategoricalVector, NullableCategoricalVector},
        -                                                contrast::ContrastsMatrix)
        -     ## make sure the levels of the contrast matrix and the categorical data
        -     ## are the same by constructing a re-indexing vector. Indexing into
        -     ## reindex with v.refs will give the corresponding row number of the
        -     ## contrast matrix
        -     reindex = [findfirst(contrast.levels, l) for l in levels(v)]
        0     contrastmatrix = convert(T, contrast.matrix)
        0     return indexrows(contrastmatrix, reindex[v.refs])
        - end
        -
        - indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]'
        - indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :]
        -
        - """
        -     expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
        - Create pairwise products of columns from a vector of matrices
        - """
        - function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
        -     if length(trm) == 1
        0         trm[1]
        -     else
        0         a = trm[1]
        0         b = expandcols(trm[2 : end])
        0         reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
        0     end
        0 end
        -
        - """
        -     droprandomeffects(trms::Terms)
        - Expressions of the form `(a|b)` are "random-effects" terms and are not
        - incorporated in the ModelMatrix.  This function checks for such terms and,
        - if any are present, drops them from the `Terms` object.
        - """
        - function droprandomeffects(trms::Terms)
        -     retrms = Bool[Meta.isexpr(t, :call) && t.args[1] == :| for t in trms.terms]
   128000     if !any(retrms)  # return trms unchanged
        0         trms
        0     elseif all(retrms) && !trms.response   # return an empty Terms object
        0         Terms(Any[],Any[],Array(Bool, (0,0)),Array(Bool, (0,0)), Int[], false, trms.intercept)
        -     else
        -         # the rows of `trms.factors` correspond to `eterms`, the columns to `terms`
        -         # After dropping random-effects terms we drop any eterms whose rows are all false
        0         ckeep = !retrms                 # columns to retain
        0         facs = trms.factors[:, ckeep]
        0         rkeep = vec(sum(facs, 2) .> 0)
        0         Terms(trms.terms[ckeep], trms.eterms[rkeep], facs[rkeep, :],
        -               trms.is_non_redundant[rkeep, ckeep],
        -               trms.order[ckeep], trms.response, trms.intercept)
        -     end
        - end
        -
        - """
        -     dropresponse!(trms::Terms)
        - Drop the response term, `trms.eterms[1]` and the first row and column
        - of `trms.factors` if `trms.response` is true.
        - """
        - function dropresponse!(trms::Terms)
        -     if trms.response
        0         ckeep = 2:size(trms.factors, 2)
        0         rkeep = vec(any(trms.factors[:, ckeep], 2))
        0         Terms(trms.terms, trms.eterms[rkeep], trms.factors[rkeep, ckeep],
        -               trms.is_non_redundant[rkeep, ckeep], trms.order[ckeep], false, trms.intercept)
        -     else
        0         trms
        -     end
        - end
        -
        - """
        -     ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
        - Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
        - `terms` and `df` members of `mf`.
        -
        - This is basically a map-reduce where terms are mapped to columns by `cols`
        - and reduced by `hcat`.  During the collection of the columns the `assign`
        - vector is created.  `assign` maps columns of the model matrix to terms in
        - the model frame.  It can also be considered as mapping coefficients to
        - terms and, hence, to names.
        -
        - If there is an intercept in the model, that column occurs first and its
        - `assign` value is zero.
        -
        - Mixed-effects models include "random-effects" terms which are ignored when
        - creating the model matrix.
        - """
        - @compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
        -     dfrm = mf.df
        0     terms = droprandomeffects(dropresponse!(mf.terms))
        0     factors = terms.factors
        -
        -     ## Step 1:
        -     ## compute which columns correspond to which terms in order to pre-allocate
        -     ## model matrix and set up views later
    80000     term_col_inds = []
        0     num_cols = convert(Int, terms.intercept)
        -
   240000     for (i_term, term) in enumerate(terms.terms)
        0         f = view(factors, :, i_term)
        0         any(f) || continue
        -
   480000         block_sizes = map((et,nr) -> nc(et, mf, non_redundant=nr),
        -                           view(terms.eterms, f),
        -                           view(terms.is_non_redundant, f, i_term))
        0         num_term_cols = prod(block_sizes)
        -
   176000         push!(term_col_inds, (1:num_term_cols)+num_cols)
        0         num_cols += num_term_cols
        -     end
        -
        0     if num_cols == 0
        0         error("Could not construct model matrix. Resulting matrix has 0 columns.")
        0     end
        -
        -     # Pre-allocate model matrix
24000128000     mm = similar(convert(T, Matrix()), size(dfrm, 1), num_cols)
   112000     assign = zeros(Int, num_cols)
        -
        0     if terms.intercept
        0         mm[:,1] = 1
        -     end
        -
        -     ## Step 2:
        -     ## Fill in the columns for each term.
        -
        -     ## Map eval. term name + redundancy bool to cached model matrix columns
        -     ## (views of model matrix itself for eval terms that correspond directly to
        -     ## mm columns, and arrays otherwise)
        0     eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
        -
        -     ## turn each term into a vector of mm columns for its eval. terms, using
        -     ## "promoted" full-rank versions of categorical columns for non-redundant
        -     ## eval. terms:
   304000     for (i_term, term) in enumerate(terms.terms)
        -         ## Pull out the eval terms, and the non-redundancy flags for this term
        0         ff = view(factors, :, i_term)
   384000         eterms = view(terms.eterms, ff)
    64000         non_redundants = view(terms.is_non_redundant, ff, i_term)
   192000         if length(eterms) == 1 && term == eterms[1]
        -             ## handle special case where term == eterm (e.g., main effect
        -             ## terms). in this case, we can store the columns generated for the
        -             ## eterm directly in the model matrix and just hold onto a view of
        -             ## those columns for any higher-order terms that need them
        0             et = eterms[1]
        0             nr = non_redundants[1]
        0             mm[:, term_col_inds[i_term]] = modelmat_cols(T, et, mf, non_redundant=nr)
16000384000             eterm_cols[(et, nr)] = view(mm, :, term_col_inds[i_term])
        -         else
        -             ## general case with multiple eterms and expandcols
        0             blocks = T[]
        0             for (et, nr) in zip(eterms, non_redundants)
        0                 block = get!(eterm_cols, (et, nr)) do
        -                     modelmat_cols(T, et, mf, non_redundant=nr)
        -                 end
        0                 push!(blocks, block)
        -             end
        0             mm[:, term_col_inds[i_term]] = expandcols(blocks)
        -         end
        0         assign[term_col_inds[i_term]] = i_term
        -     end
        -
    32000     ModelMatrix{T}(mm, assign)
        - end
        - ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)
        -
        -
        - """
        -     termnames(term::Symbol, col)
        - Returns a vector of strings with the names of the coefficients
        - associated with a term.  If the column corresponding to the term
        - is not a `CategoricalArray` or `NullableCategoricalArray`,
        - a one-element vector is returned.
        - """
        - termnames(term::Symbol, col) = [string(term)]
        - function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false)
        -     if haskey(mf.contrasts, term)
        -         termnames(term, mf.df[term],
        -                   non_redundant ?
        -                   ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) :
        -                   mf.contrasts[term])
        -     else
        -         termnames(term, mf.df[term])
        -     end
        - end
        -
        - termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) =
        -     ["$term: $name" for name in contrast.termnames]
        -
        -
        - function expandtermnames(term::Vector)
        -     if length(term) == 1
        -         return term[1]
        -     else
        -         return foldr((a,b) -> vec([string(lev1, " & ", lev2) for
        -                                    lev1 in a,
        -                                    lev2 in b]),
        -                      term)
        -     end
        - end
        -
        -
        - """
        -     coefnames(mf::ModelFrame)
        - Returns a vector of coefficient names constructed from the Terms
        - member and the types of the evaluation columns.
        - """
        - function coefnames(mf::ModelFrame)
        -     terms = droprandomeffects(dropresponse!(mf.terms))
        -
        -     ## strategy mirrors ModelMatrx constructor:
        -     eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}()
        -     term_names = Vector{Compat.UTF8String}[]
        -
        -     if terms.intercept
        -         push!(term_names, Compat.UTF8String["(Intercept)"])
        -     end
        -
        -     factors = terms.factors
        -
        -     for (i_term, term) in enumerate(terms.terms)
        -
        -         ## names for columns for eval terms
        -         names = Vector{Compat.UTF8String}[]
        -
        -         ff = Compat.view(factors, :, i_term)
        -         eterms = Compat.view(terms.eterms, ff)
        -         non_redundants = Compat.view(terms.is_non_redundant, ff, i_term)
        -
        -         for (et, nr) in zip(eterms, non_redundants)
        -             if !haskey(eterm_names, (et, nr))
        -                 eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr)
        -             end
        -             push!(names, eterm_names[(et, nr)])
        -         end
        -         push!(term_names, expandtermnames(names))
        -     end
        -
        -     reduce(vcat, Vector{Compat.UTF8String}(), term_names)
        - end
        -
        - function Formula(t::Terms)
        -     lhs = t.response ? t.eterms[1] : nothing
        -     rhs = Expr(:call,:+)
        -     if t.intercept
        -         push!(rhs.args,1)
        -     end
        -     append!(rhs.args,t.terms)
        -     Formula(lhs,rhs)
        - end
        -

## modelmatrix.jl
using DataFrames
using BenchmarkTools
using JLD

srand(1)

n = 1_000_000
d = DataFrame(x = rand(n), y = rand(n))

suite = BenchmarkGroup()

suite["ModelFrame"] = BenchmarkGroup()
suite["ModelMatrix"] = BenchmarkGroup()

for rhs in [ :(x), :(x+y), :(x*y) ]
    f = Formula(nothing, rhs)
    suite["ModelFrame"][rhs] = @benchmarkable ModelFrame($f, $d)
    mf = ModelFrame(f, d)
    suite["ModelMatrix"][rhs] = @benchmarkable ModelMatrix($mf)
end

# run these lines the first time to tune the benchmarking parameters:
# tune!(suite)
# JLD.save(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite", params(suite))

# then load them for future runs:
loadparams!(suite,
            JLD.load(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite"),
            :evals, :samples)

results = run(suite, verbose=true)

## modelmatrix2.jl
# run a fixed number of time for memory allocation profiling

using DataFrames

srand(1)

n = 1_000_000
d = DataFrame(x = rand(n), y = rand(n))

f = Formula(nothing, :(x+y))

mf = ModelFrame(f, d)
mm = ModelMatrix(mf)

Profile.init(delay=0.01)
Profile.clear()
Profile.clear_malloc_data()

@profile @time for _ in 1:1000
    ModelMatrix(mf)
end

# Write profile results to profile.bin.
open("profile.bin", "w") do f serialize(f, Profile.retrieve()) end

## results
julia> j = judge(median(results), median(results_master))

julia> j["ModelMatrix"]
3-element BenchmarkTools.BenchmarkGroup:
  tags: []
  :(x + y) => TrialJudgement(+21.64% => regression)
  :(x * y) => TrialJudgement(+19.32% => regression)
  :x => TrialJudgement(-14.57% => improvement)

julia> j["ModelFrame"]
3-element BenchmarkTools.BenchmarkGroup:
  tags: []
  :(x + y) => TrialJudgement(+0.10% => invariant)
  :(x * y) => TrialJudgement(-0.88% => invariant)
  :x => TrialJudgement(-1.63% => invariant)


julia> results_master["ModelMatrix"][:x]
BenchmarkTools.Trial:
  samples:          477
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  22.89 mb
  allocs estimate:  79
  minimum time:     8.11 ms (11.03% GC)
  median time:      10.20 ms (31.64% GC)
  mean time:        10.49 ms (31.88% GC)
  maximum time:     17.91 ms (51.26% GC)

julia> results["ModelMatrix"][:x]
BenchmarkTools.Trial:
  samples:          557
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  22.89 mb
  allocs estimate:  89
  minimum time:     8.00 ms (12.85% GC)
  median time:      8.71 ms (31.05% GC)
  mean time:        8.98 ms (31.10% GC)
  maximum time:     13.94 ms (35.56% GC)

julia> results_master["ModelMatrix"][:(x+y)]
BenchmarkTools.Trial:
  samples:          356
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  30.52 mb
  allocs estimate:  128
  minimum time:     12.37 ms (26.39% GC)
  median time:      13.12 ms (25.51% GC)
  mean time:        14.05 ms (24.98% GC)
  maximum time:     224.42 ms (1.35% GC)

julia> results["ModelMatrix"][:(x+y)]
BenchmarkTools.Trial:
  samples:          306
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  38.15 mb
  allocs estimate:  148
  minimum time:     12.63 ms (7.10% GC)
  median time:      15.95 ms (25.34% GC)
  mean time:        16.34 ms (25.49% GC)
  maximum time:     23.80 ms (17.63% GC)


julia> results_master["ModelMatrix"][:(x*y)]
BenchmarkTools.Trial:
  samples:          224
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  45.79 mb
  allocs estimate:  233
  minimum time:     18.26 ms (24.74% GC)
  median time:      19.28 ms (24.32% GC)
  mean time:        22.35 ms (23.24% GC)
  maximum time:     549.82 ms (2.38% GC)

julia> results["ModelMatrix"][:(x*y)]
BenchmarkTools.Trial:
  samples:          213
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
  memory estimate:  53.42 mb
  allocs estimate:  286
  minimum time:     21.63 ms (25.27% GC)
  median time:      23.01 ms (25.53% GC)
  mean time:        23.52 ms (25.68% GC)
  maximum time:     33.33 ms (5.18% GC)
	- # Formulas for representing and working with linear-model-type expressions
	- # Original by Harlan D. Harris. Later modifications by John Myles White
	- # and Douglas M. Bates.
	-
	- ## Formulas are written as expressions and parsed by the Julia parser.
	- ## For example :(y ~ a + b + log(c))
	- ## In Julia the & operator is used for an interaction. What would be written
	- ## in R as y ~ a + b + a:b is written :(y ~ a + b + a&b) in Julia.
	- ## The equivalent R expression, y ~ a*b, is the same in Julia
	-
	- ## The lhs of a one-sided formula is 'nothing'
	- ## The rhs of a formula can be 1
	-
	- type Formula
	- lhs::@compat(Union{Symbol, Expr, Void})
	- rhs::@compat(Union{Symbol, Expr, Integer})
	- end
	-
	- macro ~(lhs, rhs)
	- ex = Expr(:call,
	- :Formula,
	- Base.Meta.quot(lhs),
	- Base.Meta.quot(rhs))
	- return ex
	- end
	-
	- #
	192000 # TODO: implement contrast types in Formula/Terms
	- #
	- type Terms
	- terms::Vector
	- eterms::Vector # evaluation terms
	- factors::Matrix{Bool} # maps terms to evaluation terms
	- ## An eterms x terms matrix which is true for terms that need to be "promoted"
	- ## to full rank in constructing a model matrx
	- is_non_redundant::Matrix{Bool}
	- # order can probably be dropped. It is vec(sum(factors, 1))
	- order::Vector{Int} # orders of rhs terms
	- response::Bool # indicator of a response, which is eterms[1] if present
	- intercept::Bool # is there an intercept column in the model matrix?
	- end
	-
	- type ModelFrame
	- df::AbstractDataFrame
	- terms::Terms
	- msng::BitArray
	- ## mapping from df keys to contrasts matrices
	- contrasts::Dict{Symbol, ContrastsMatrix}
	- end
	-
	- typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}
	-
	- type ModelMatrix{T <: AbstractFloatMatrix}
	- m::T
	- assign::Vector{Int}
	- end
	-
	- Base.size(mm::ModelMatrix) = size(mm.m)
	- Base.size(mm::ModelMatrix, dim...) = size(mm.m, dim...)
	-
	- function Base.show(io::IO, f::Formula)
	- print(io,
	- string("Formula: ",
	- f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs))
	576000 end
	-
	- ## Return, as a vector of symbols, the names of all the variables in
	- ## an expression or a formula
	- function allvars(ex::Expr)
	- if ex.head != :call error("Non-call expression encountered") end
	- cc=Symbol[]
	- for i in ex.args[2:end] cc=append!(cc,allvars(i)) end
	- cc
	- end
	- allvars(f::Formula) = unique(vcat(allvars(f.rhs), allvars(f.lhs)))
	- allvars(sym::Symbol) = [sym]
	- allvars(v::Any) = Array(Symbol, 0)
	-
	- # special operators in formulas
	- const specials = Set([:+, :-, :*, :/, :&, :\|, :^])
	448000
	- function dospecials(ex::Expr)
	0 if ex.head != :call error("Non-call expression encountered") end
	0 a1 = ex.args[1]
	0 if !(a1 in specials) return ex end
	0 excp = copy(ex)
	0 excp.args = vcat(a1,map(dospecials, ex.args[2:end]))
	0 if a1 != :* return excp end
	0 aa = excp.args
	0 a2 = aa[2]
	0 a3 = aa[3]
	0 if length(aa) > 3
	0 excp.args = vcat(a1, aa[3:end])
	0 a3 = dospecials(excp)
	- end
	- ## this order of expansion gives the R-style ordering of interaction
	- ## terms (after sorting in increasing interaction order) for higher-
	- ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 +
	0 ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3)
	0 :($a2 + $a2 & $a3 + $a3)
	- end
	- dospecials(a::Any) = a
	-
	- ## Distribution of & over +
	- const distributive = @compat Dict(:& => :+)
	-
	- distribute(ex::Expr) = distribute!(copy(ex))
	- distribute(a::Any) = a
	- ## apply distributive property in-place
	- function distribute!(ex::Expr)
	0 if ex.head != :call error("Non-call expression encountered") end
	0 [distribute!(a) for a in ex.args[2:end]]
	- ## check that top-level can be distributed
	0 a1 = ex.args[1]
	0 if a1 in keys(distributive)
	-
	- ## which op is being DISTRIBUTED (e.g. &, *)?
	0 distributed_op = a1
	- ## which op is doing the distributing (e.g. +)?
	0 distributing_op = distributive[a1]
	-
	- ## detect distributing sub-expression (first arg is, e.g. :+)
	0 is_distributing_subex(e) =
	- typeof(e)==Expr && e.head == :call && e.args[1] == distributing_op
	-
	- ## find first distributing subex
	0 first_distributing_subex = findfirst(is_distributing_subex, ex.args)
	0 if first_distributing_subex != 0
	- ## remove distributing subexpression from args
	0 subex = splice!(ex.args, first_distributing_subex)
	-
	0 newargs = Any[distributing_op]
	- ## generate one new sub-expression, which calls the distributed operation
	0 ## (e.g. &) on each of the distributing sub-expression's arguments, plus
	- ## the non-distributed arguments of the original expression.
	0 for a in subex.args[2:end]
	0 new_subex = copy(ex)
	0 push!(new_subex.args, a)
	- ## need to recurse here, in case there are any other
	- ## distributing operations in the sub expression
	0 distribute!(new_subex)
	0 push!(newargs, new_subex)
	- end
	0 ex.args = newargs
	- end
	- end
	0 ex
	- end
	- distribute!(a::Any) = a
	-
	- const associative = Set([:+,:*,:&]) # associative special operators
	-
	- ## If the expression is a call to the function s return its arguments
	- ## Otherwise return the expression
	- function ex_or_args(ex::Expr,s::Symbol)
	- if ex.head != :call error("Non-call expression encountered") end
	- if ex.args[1] == s
	- ## recurse in case there are more :calls of s below
	- return vcat(map(x -> ex_or_args(x, s), ex.args[2:end])...)
	- else
	- ## not a :call to s, return condensed version of ex
	- return condense(ex)
	- end
	- end
	- ex_or_args(a,s::Symbol) = a
	-
	- ## Condense calls like :(+(a,+(b,c))) to :(+(a,b,c))
	- function condense(ex::Expr)
	0 if ex.head != :call error("Non-call expression encountered") end
	0 a1 = ex.args[1]
	0 if !(a1 in associative) return ex end
	0 excp = copy(ex)
	0 excp.args = vcat(a1, map(x->ex_or_args(x,a1), ex.args[2:end])...)
	0 excp
	- end
	- condense(a::Any) = a
	-
	- ## always return an ARRAY of terms
	- getterms(ex::Expr) = (ex.head == :call && ex.args[1] == :+) ? ex.args[2:end] : Expr[ex]
	- getterms(a::Any) = Any[a]
	-
	- ord(ex::Expr) = (ex.head == :call && ex.args[1] == :&) ? length(ex.args)-1 : 1
	- ord(a::Any) = 1
	-
	- const nonevaluation = Set([:&,:\|]) # operators constructed from other evaluations
	- ## evaluation terms - the (filtered) arguments for :& and :\|, otherwise the term itself
	- function evt(ex::Expr)
	- if ex.head != :call error("Non-call expression encountered") end
	- if !(ex.args[1] in nonevaluation) return ex end
	- filter(x->!isa(x,Number), vcat(map(getterms, ex.args[2:end])...))
	- end
	- evt(a) = Any[a]
	-
	- function Terms(f::Formula)
	- rhs = condense(distribute(dospecials(f.rhs)))
	0 tt = unique(getterms(rhs))
	0 tt = tt[!(tt .== 1)] # drop any explicit 1's
	0 noint = (tt .== 0) \| (tt .== -1) # should also handle :(-(expr,1))
	0 tt = tt[!noint]
	0 oo = Int[ord(t) for t in tt] # orders of interaction terms
	0 if !issorted(oo) # sort terms by increasing order
	0 pp = sortperm(oo)
	0 tt = tt[pp]
	0 oo = oo[pp]
	- end
	0 etrms = map(evt, tt)
	0 haslhs = f.lhs != nothing
	0 if haslhs
	0 unshift!(etrms, Any[f.lhs])
	0 unshift!(oo, 1)
	- end
	0 ev = unique(vcat(etrms...))
	0 sets = [Set(x) for x in etrms]
	0 facs = Bool[t in s for t in ev, s in sets]
	0 non_redundants = fill(false, size(facs)) # initialize to false
	0 Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint))
	- end
	-
	- ## Default NULL handler. Others can be added as keyword arguments
	- function null_omit(df::DataFrame)
	- cc = complete_cases(df)
	0 sub(df, cc), cc
	- end
	-
	- _droplevels!(x::Any) = x
	- _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
	-
	- is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
	- is_categorical(::Any) = false
	-
	- ## Check for non-redundancy of columns. For instance, if x is a factor with two
	- ## levels, it should be expanded into two columns in y~0+x but only one column
	- ## in y~1+x. The default is the rank-reduced form (contrasts for n levels only
	- ## produce n-1 columns). In general, an evaluation term x within a term
	- ## x&y&... needs to be "promoted" to full rank if y&... hasn't already been
	- ## included (either explicitly in the Terms or implicitly by promoting another
	- ## term like z in z&y&...).
	- ##
	- ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
	- ## redundant evaluation terms.
	- function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
	-
	- (n_eterms, n_terms) = size(trms.factors)
	-
	0 encountered_columns = Vector{eltype(trms.factors)}[]
	-
	0 if trms.intercept
	0 push!(encountered_columns, zeros(eltype(trms.factors), n_eterms))
	- end
	-
	0 for i_term in 1:n_terms
	0 for i_eterm in 1:n_eterms
	- ## only need to check eterms that are included and can be promoted
	- ## (e.g., categorical variables that expand to multiple mm columns)
	0 if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]])
	0 dropped = trms.factors[:,i_term]
	0 dropped[i_eterm] = 0
	-
	0 if findfirst(encountered_columns, dropped) == 0
	0 trms.is_non_redundant[i_eterm, i_term] = true
	0 push!(encountered_columns, dropped)
	- end
	-
	- end
	- end
	- ## once we've checked all the eterms in this term, add it to the list
	- ## of encountered terms/columns
	0 push!(encountered_columns, Compat.view(trms.factors, :, i_term))
	- end
	-
	0 return trms.is_non_redundant
	- end
	-
	-
	- const DEFAULT_CONTRASTS = DummyCoding
	-
	- ## Set up contrasts:
	- ## Combine actual DF columns and contrast types if necessary to compute the
	- ## actual contrasts matrices, levels, and term names (using DummyCoding
	- ## as the default)
	- function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
	- evaledContrasts = Dict()
	0 for (term, col) in eachcol(df)
	0 is_categorical(col) \|\| continue
	0 evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
	- contrasts[term] :
	- DEFAULT_CONTRASTS(),
	- col)
	- end
	0 return evaledContrasts
	- end
	-
	- function ModelFrame(trms::Terms, d::AbstractDataFrame;
	- contrasts::Dict = Dict())
	- df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
	0 names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
	0 for c in eachcol(df) _droplevels!(c[2]) end
	-
	0 evaledContrasts = evalcontrasts(df, contrasts)
	-
	- ## Check for non-redundant terms, modifying terms in place
	0 check_non_redundancy!(trms, df)
	-
	0 ModelFrame(df, trms, msng, evaledContrasts)
	- end
	-
	- ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
	- ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
	- ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
	-
	- ## modify contrasts in place
	- function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
	- new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col]))
	- for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
	-
	- mf.contrasts = merge(mf.contrasts, new_contrasts)
	- return mf
	- end
	- setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))
	-
	- """
	- StatsBase.model_response(mf::ModelFrame)
	- Extract the response column, if present. `DataVector` or
	- `PooledDataVector` columns are converted to `Array`s
	- """
	- function StatsBase.model_response(mf::ModelFrame)
	- if mf.terms.response
	- convert(Array, mf.df[mf.terms.eterms[1]])
	- else
	- error("Model formula one-sided")
	- end
	- end
	-
	- function nc(name::Symbol, mf::ModelFrame; non_redundant::Bool=false)
	- if haskey(mf.contrasts, name)
	0 size(non_redundant ?
	- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
	- mf.contrasts[name],
	- 2)
	- else
	0 1
	- end
	- end
	992000
	- ## construct model matrix columns from model frame + name (checks for contrasts)
	- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
	- if haskey(mf.contrasts, name)
	0 modelmat_cols(T, mf.df[name],
	- non_redundant ?
	- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
	- mf.contrasts[name])
	- else
	0 modelmat_cols(T, mf.df[name])
	- end
	- end
	-
	- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
	- convert(T, reshape(v, length(v), 1))
	- # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
	- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
	- convert(T, Matrix(reshape(v, length(v), 1)))
	-
	- """
	- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
	-
	16000 Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
	- levels align properly.
	- """
	- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
	- v::Union{CategoricalVector, NullableCategoricalVector},
	- contrast::ContrastsMatrix)
	- ## make sure the levels of the contrast matrix and the categorical data
	- ## are the same by constructing a re-indexing vector. Indexing into
	- ## reindex with v.refs will give the corresponding row number of the
	- ## contrast matrix
	- reindex = [findfirst(contrast.levels, l) for l in levels(v)]
	0 contrastmatrix = convert(T, contrast.matrix)
	0 return indexrows(contrastmatrix, reindex[v.refs])
	- end
	-
	- indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]'
	- indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :]
	-
	- """
	- expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
	- Create pairwise products of columns from a vector of matrices
	- """
	- function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
	- if length(trm) == 1
	0 trm[1]
	- else
	0 a = trm[1]
	0 b = expandcols(trm[2 : end])
	0 reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
	0 end
	0 end
	-
	- """
	- droprandomeffects(trms::Terms)
	- Expressions of the form `(a\|b)` are "random-effects" terms and are not
	- incorporated in the ModelMatrix. This function checks for such terms and,
	- if any are present, drops them from the `Terms` object.
	- """
	- function droprandomeffects(trms::Terms)
	- retrms = Bool[Meta.isexpr(t, :call) && t.args[1] == :\| for t in trms.terms]
	128000 if !any(retrms) # return trms unchanged
	0 trms
	0 elseif all(retrms) && !trms.response # return an empty Terms object
	0 Terms(Any[],Any[],Array(Bool, (0,0)),Array(Bool, (0,0)), Int[], false, trms.intercept)
	- else
	- # the rows of `trms.factors` correspond to `eterms`, the columns to `terms`
	- # After dropping random-effects terms we drop any eterms whose rows are all false
	0 ckeep = !retrms # columns to retain
	0 facs = trms.factors[:, ckeep]
	0 rkeep = vec(sum(facs, 2) .> 0)
	0 Terms(trms.terms[ckeep], trms.eterms[rkeep], facs[rkeep, :],
	- trms.is_non_redundant[rkeep, ckeep],
	- trms.order[ckeep], trms.response, trms.intercept)
	- end
	- end
	-
	- """
	- dropresponse!(trms::Terms)
	- Drop the response term, `trms.eterms[1]` and the first row and column
	- of `trms.factors` if `trms.response` is true.
	- """
	- function dropresponse!(trms::Terms)
	- if trms.response
	0 ckeep = 2:size(trms.factors, 2)
	0 rkeep = vec(any(trms.factors[:, ckeep], 2))
	0 Terms(trms.terms, trms.eterms[rkeep], trms.factors[rkeep, ckeep],
	- trms.is_non_redundant[rkeep, ckeep], trms.order[ckeep], false, trms.intercept)
	- else
	0 trms
	- end
	- end
	-
	- """
	- ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
	- Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
	- `terms` and `df` members of `mf`.
	-
	- This is basically a map-reduce where terms are mapped to columns by `cols`
	- and reduced by `hcat`. During the collection of the columns the `assign`
	- vector is created. `assign` maps columns of the model matrix to terms in
	- the model frame. It can also be considered as mapping coefficients to
	- terms and, hence, to names.
	-
	- If there is an intercept in the model, that column occurs first and its
	- `assign` value is zero.
	-
	- Mixed-effects models include "random-effects" terms which are ignored when
	- creating the model matrix.
	- """
	- @compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
	- dfrm = mf.df
	0 terms = droprandomeffects(dropresponse!(mf.terms))
	0 factors = terms.factors
	-
	- ## Step 1:
	- ## compute which columns correspond to which terms in order to pre-allocate
	- ## model matrix and set up views later
	80000 term_col_inds = []
	0 num_cols = convert(Int, terms.intercept)
	-
	240000 for (i_term, term) in enumerate(terms.terms)
	0 f = view(factors, :, i_term)
	0 any(f) \|\| continue
	-
	480000 block_sizes = map((et,nr) -> nc(et, mf, non_redundant=nr),
	- view(terms.eterms, f),
	- view(terms.is_non_redundant, f, i_term))
	0 num_term_cols = prod(block_sizes)
	-
	176000 push!(term_col_inds, (1:num_term_cols)+num_cols)
	0 num_cols += num_term_cols
	- end
	-
	0 if num_cols == 0
	0 error("Could not construct model matrix. Resulting matrix has 0 columns.")
	0 end
	-
	- # Pre-allocate model matrix
	24000128000 mm = similar(convert(T, Matrix()), size(dfrm, 1), num_cols)
	112000 assign = zeros(Int, num_cols)
	-
	0 if terms.intercept
	0 mm[:,1] = 1
	- end
	-
	- ## Step 2:
	- ## Fill in the columns for each term.
	-
	- ## Map eval. term name + redundancy bool to cached model matrix columns
	- ## (views of model matrix itself for eval terms that correspond directly to
	- ## mm columns, and arrays otherwise)
	0 eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
	-
	- ## turn each term into a vector of mm columns for its eval. terms, using
	- ## "promoted" full-rank versions of categorical columns for non-redundant
	- ## eval. terms:
	304000 for (i_term, term) in enumerate(terms.terms)
	- ## Pull out the eval terms, and the non-redundancy flags for this term
	0 ff = view(factors, :, i_term)
	384000 eterms = view(terms.eterms, ff)
	64000 non_redundants = view(terms.is_non_redundant, ff, i_term)
	192000 if length(eterms) == 1 && term == eterms[1]
	- ## handle special case where term == eterm (e.g., main effect
	- ## terms). in this case, we can store the columns generated for the
	- ## eterm directly in the model matrix and just hold onto a view of
	- ## those columns for any higher-order terms that need them
	0 et = eterms[1]
	0 nr = non_redundants[1]
	0 mm[:, term_col_inds[i_term]] = modelmat_cols(T, et, mf, non_redundant=nr)
	16000384000 eterm_cols[(et, nr)] = view(mm, :, term_col_inds[i_term])
	- else
	- ## general case with multiple eterms and expandcols
	0 blocks = T[]
	0 for (et, nr) in zip(eterms, non_redundants)
	0 block = get!(eterm_cols, (et, nr)) do
	- modelmat_cols(T, et, mf, non_redundant=nr)
	- end
	0 push!(blocks, block)
	- end
	0 mm[:, term_col_inds[i_term]] = expandcols(blocks)
	- end
	0 assign[term_col_inds[i_term]] = i_term
	- end
	-
	32000 ModelMatrix{T}(mm, assign)
	- end
	- ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)
	-
	-
	- """
	- termnames(term::Symbol, col)
	- Returns a vector of strings with the names of the coefficients
	- associated with a term. If the column corresponding to the term
	- is not a `CategoricalArray` or `NullableCategoricalArray`,
	- a one-element vector is returned.
	- """
	- termnames(term::Symbol, col) = [string(term)]
	- function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false)
	- if haskey(mf.contrasts, term)
	- termnames(term, mf.df[term],
	- non_redundant ?
	- ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) :
	- mf.contrasts[term])
	- else
	- termnames(term, mf.df[term])
	- end
	- end
	-
	- termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) =
	- ["$term: $name" for name in contrast.termnames]
	-
	-
	- function expandtermnames(term::Vector)
	- if length(term) == 1
	- return term[1]
	- else
	- return foldr((a,b) -> vec([string(lev1, " & ", lev2) for
	- lev1 in a,
	- lev2 in b]),
	- term)
	- end
	- end
	-
	-
	- """
	- coefnames(mf::ModelFrame)
	- Returns a vector of coefficient names constructed from the Terms
	- member and the types of the evaluation columns.
	- """
	- function coefnames(mf::ModelFrame)
	- terms = droprandomeffects(dropresponse!(mf.terms))
	-
	- ## strategy mirrors ModelMatrx constructor:
	- eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}()
	- term_names = Vector{Compat.UTF8String}[]
	-
	- if terms.intercept
	- push!(term_names, Compat.UTF8String["(Intercept)"])
	- end
	-
	- factors = terms.factors
	-
	- for (i_term, term) in enumerate(terms.terms)
	-
	- ## names for columns for eval terms
	- names = Vector{Compat.UTF8String}[]
	-
	- ff = Compat.view(factors, :, i_term)
	- eterms = Compat.view(terms.eterms, ff)
	- non_redundants = Compat.view(terms.is_non_redundant, ff, i_term)
	-
	- for (et, nr) in zip(eterms, non_redundants)
	- if !haskey(eterm_names, (et, nr))
	- eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr)
	- end
	- push!(names, eterm_names[(et, nr)])
	- end
	- push!(term_names, expandtermnames(names))
	- end
	-
	- reduce(vcat, Vector{Compat.UTF8String}(), term_names)
	- end
	-
	- function Formula(t::Terms)
	- lhs = t.response ? t.eterms[1] : nothing
	- rhs = Expr(:call,:+)
	- if t.intercept
	- push!(rhs.args,1)
	- end
	- append!(rhs.args,t.terms)
	- Formula(lhs,rhs)
	- end
	-
	using DataFrames
	using BenchmarkTools
	using JLD

	srand(1)

	n = 1_000_000
	d = DataFrame(x = rand(n), y = rand(n))

	suite = BenchmarkGroup()

	suite["ModelFrame"] = BenchmarkGroup()
	suite["ModelMatrix"] = BenchmarkGroup()

	for rhs in [ :(x), :(x+y), :(x*y) ]
	f = Formula(nothing, rhs)
	suite["ModelFrame"][rhs] = @benchmarkable ModelFrame($f, $d)
	mf = ModelFrame(f, d)
	suite["ModelMatrix"][rhs] = @benchmarkable ModelMatrix($mf)
	end

	# run these lines the first time to tune the benchmarking parameters:
	# tune!(suite)
	# JLD.save(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite", params(suite))

	# then load them for future runs:
	loadparams!(suite,
	JLD.load(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite"),
	:evals, :samples)

	results = run(suite, verbose=true)
	# run a fixed number of time for memory allocation profiling

	using DataFrames

	srand(1)

	n = 1_000_000
	d = DataFrame(x = rand(n), y = rand(n))

	f = Formula(nothing, :(x+y))

	mf = ModelFrame(f, d)
	mm = ModelMatrix(mf)

	Profile.init(delay=0.01)
	Profile.clear()
	Profile.clear_malloc_data()

	@profile @time for _ in 1:1000
	ModelMatrix(mf)
	end

	# Write profile results to profile.bin.
	open("profile.bin", "w") do f serialize(f, Profile.retrieve()) end
	julia> j = judge(median(results), median(results_master))

	julia> j["ModelMatrix"]
	3-element BenchmarkTools.BenchmarkGroup:
	tags: []
	:(x + y) => TrialJudgement(+21.64% => regression)
	:(x * y) => TrialJudgement(+19.32% => regression)
	:x => TrialJudgement(-14.57% => improvement)

	julia> j["ModelFrame"]
	3-element BenchmarkTools.BenchmarkGroup:
	tags: []
	:(x + y) => TrialJudgement(+0.10% => invariant)
	:(x * y) => TrialJudgement(-0.88% => invariant)
	:x => TrialJudgement(-1.63% => invariant)


	julia> results_master["ModelMatrix"][:x]
	BenchmarkTools.Trial:
	samples: 477
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 22.89 mb
	allocs estimate: 79
	minimum time: 8.11 ms (11.03% GC)
	median time: 10.20 ms (31.64% GC)
	mean time: 10.49 ms (31.88% GC)
	maximum time: 17.91 ms (51.26% GC)

	julia> results["ModelMatrix"][:x]
	BenchmarkTools.Trial:
	samples: 557
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 22.89 mb
	allocs estimate: 89
	minimum time: 8.00 ms (12.85% GC)
	median time: 8.71 ms (31.05% GC)
	mean time: 8.98 ms (31.10% GC)
	maximum time: 13.94 ms (35.56% GC)

	julia> results_master["ModelMatrix"][:(x+y)]
	BenchmarkTools.Trial:
	samples: 356
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 30.52 mb
	allocs estimate: 128
	minimum time: 12.37 ms (26.39% GC)
	median time: 13.12 ms (25.51% GC)
	mean time: 14.05 ms (24.98% GC)
	maximum time: 224.42 ms (1.35% GC)

	julia> results["ModelMatrix"][:(x+y)]
	BenchmarkTools.Trial:
	samples: 306
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 38.15 mb
	allocs estimate: 148
	minimum time: 12.63 ms (7.10% GC)
	median time: 15.95 ms (25.34% GC)
	mean time: 16.34 ms (25.49% GC)
	maximum time: 23.80 ms (17.63% GC)


	julia> results_master["ModelMatrix"][:(x*y)]
	BenchmarkTools.Trial:
	samples: 224
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 45.79 mb
	allocs estimate: 233
	minimum time: 18.26 ms (24.74% GC)
	median time: 19.28 ms (24.32% GC)
	mean time: 22.35 ms (23.24% GC)
	maximum time: 549.82 ms (2.38% GC)

	julia> results["ModelMatrix"][:(x*y)]
	BenchmarkTools.Trial:
	samples: 213
	evals/sample: 1
	time tolerance: 5.00%
	memory tolerance: 1.00%
	memory estimate: 53.42 mb
	allocs estimate: 286
	minimum time: 21.63 ms (25.27% GC)
	median time: 23.01 ms (25.53% GC)
	mean time: 23.52 ms (25.68% GC)
	maximum time: 33.33 ms (5.18% GC)