Skip to content

Instantly share code, notes, and snippets.

@skranz
Created May 17, 2014 09:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save skranz/b2343e7178a657328f49 to your computer and use it in GitHub Desktop.
Save skranz/b2343e7178a657328f49 to your computer and use it in GitHub Desktop.
mutate_if
# Tools to make it run
deparse_all <- function(x) {
deparse2 <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "")
vapply(x, deparse2, FUN.VALUE = character(1))
}
dt_env <- function(dt, env) {
env <- new.env(parent = env, size = 2L)
env$dt <- dt
env$vars <- deparse_all(groups(dt))
env
}
# sugessted change of in manip.r
#' Data manipulation functions.
#'
#' These five functions form the backbone of dplyr. They are all S3 generic
#' functions with methods for each individual data type. All functions work
#' exactly the same way: the first argument is the tbl, and the
#' subsequence arguments are interpreted in the context of that tbl.
#'
#' @section Manipulation functions:
#'
#' The five key data manipulation functions are:
#'
#' \itemize{
#' \item filter: return only a subset of the rows. If multiple conditions are
#' supplied they are combined with \code{&}.
#' \item select: return only a subset of the columns. If multiple columns are
#' supplied they are all used.
#' \item arrange: reorder the rows. Multiple inputs are ordered from left-to-
#' right.
#' \item mutate: add new columns or replace existing columns. Multiple inputs create multiple columns.
#' \item mutate_if: replace selected rows of existing columns.
#' \item summarise: reduce each group to a single row. Multiple inputs create
#' multiple output summaries.
#' }
#'
#' These are all made significantly more useful when applied by group,
#' as with \code{\link{group_by}}
#'
#' @section Tbls:
#'
#' dplyr comes with three built-in tbls. Read the help for the
#' manip methods of that class to get more details:
#'
#' \itemize{
#' \item data.frame: \link{manip_df}
#' \item data.table: \link{manip_dt}
#' \item SQLite: \code{\link{src_sqlite}}
#' \item PostgreSQL: \code{\link{src_postgres}}
#' \item MySQL: \code{\link{src_mysql}}
#' }
#'
#' @section Output:
#'
#' Generally, manipulation functions will return an output object of the
#' same type as their input. The exceptions are:
#'
#' \itemize{
#' \item \code{summarise} will return an ungrouped source
#' \item remote sources (like databases) will typically return a local
#' source from at least \code{summarise} and \code{mutate}
#' }
#'
#' @section Row names:
#'
#' dplyr methods do not preserve row names. If have been using row names
#' to store important information, please make them explicit variables.
#'
#' @name manip
#' @param .data a tbl
#' @param ... variables interpreted in the context of that data frame.
#' @examples
#' filter(mtcars, cyl == 8)
#' select(mtcars, mpg, cyl, hp:vs)
#' arrange(mtcars, cyl, disp)
#' mutate(mtcars, displ_l = disp / 61.0237)
#' mutate_if(mtcars,cyl==8, displ_l = disp / 61.0237)
#' summarise(mtcars, mean(disp))
#' summarise(group_by(mtcars, cyl), mean(disp))
NULL
# code for manip.r
#' @rdname manip
#' @export
mutate_if = function (.data,.if,...) {
UseMethod("mutate_if")
}
# for tbl-data.frame.R
#' @rdname manip_df
#' @export
mutate_if.data.frame =function (.data,.if,...)
{
dt = as.data.table(.data)
.if.quoted = substitute(.if)
as.data.frame(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
}
# for manip-df.r
#' @rdname manip_df
#' @export
mutate_if.tbl_df <- function (.data,.if,...) {
dt = as.data.table(.data)
.if.quoted = substitute(.if)
tbl_df(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
}
#' @export
mutate.tbl_dt <- function(.data,.if, ...) {
.if.quoted = substitute(.if)
tbl_dt(
mutate_if.data.table(.data=.data,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame())
)
}
# for manip-dt.r
#' @rdname manip_dt
#' @export
mutate_if.data.table <- function (.data,.if, ..., inplace = FALSE,.if.quoted=NULL, .parent.env=parent.frame())
{
if (is.null(.if.quoted))
.if.quoted = substitute(.if)
if (!inplace)
.data <- copy(.data)
env <- new.env(parent = .parent.env, size = 1L)
env$data <- .data
cols <- named_dots(...)
for (i in seq_along(cols)) {
call <- substitute(data[.if.quoted, `:=`(lhs, rhs)], list(lhs = as.name(names(cols)[[i]]), rhs = cols[[i]], .if.quoted =.if.quoted))
eval(call, env)
}
.data
}
# for manip-grouped-dt.r
#' @rdname manip_grouped_dt
#' @export
mutate_if.grouped_dt <- function(.data,.if, ..., inplace = FALSE, .if.quoted=NULL) {
data <- .data
if (is.null(.if.quoted))
.if.quoted = substitute(.if)
if (!inplace) data <- copy(data)
env <- dt_env(data, parent.frame())
cols <- named_dots(...)
# For each new variable, generate a call of the form df[, new := expr]
for(col in names(cols)) {
call <- substitute(dt[.if.quoted, lhs := rhs, by = vars],
list(lhs = as.name(col), rhs = cols[[col]], .if.quoted=.if.quoted))
eval(call, env)
}
grouped_dt(
data = data,
vars = groups(.data)
)
}
#' @rdname manip_grouped_df
#' @export
mutate_if.grouped_df <- function(.data,.if, ...) {
# This function is currently extremely unelegant and inefficient
# Problem: when transforming to data.table row order will be changed
# by group_by operation at least in dplyr 0.1.3
# So I manually restore the original row order
if (NROW(.data)==0)
return(.data)
.if.quoted = substitute(.if)
vars = groups(.data)
dt = as.data.table(.data)
class(dt) = c("data.table","data.frame")
mutate(dt, INDEX.ROW__ = 1:NROW(.data), inplace=TRUE)
gdt = grouped_dt(dt, vars=vars)
gdt = mutate_if.grouped_dt(gdt,.if.quoted=.if.quoted,..., inplace=TRUE)
data = dplyr:::grouped_df(data=as.data.frame(gdt), vars=vars)
# restore original order
data = select(arrange(data, INDEX.ROW__), -INDEX.ROW__)
data
}
examples = function() {
library(microbenchmark)
#library(modify)
library(dplyr)
library(pryr)
library(data.table)
# Benckmark compared to directly using data.table or dplyr
set.seed(123456)
n = 1e1
df = data.frame(a= sample(1:3,n,replace=TRUE),
b= sample(1:100,n,replace=TRUE),
x=rnorm(n))
dt = as.data.table(df)
mutate_if(df,a==3,x=100)
mutate_if(tbl_df(df),a==1,x=200)
mutate_if(as.tbl(df),a==1,x=300,b=400)
mutate_if(dt,a==1 | a==2,x=400)
mutate_if(group_by(dt,a),a==1 | a==2,x=mean(b))
# Quite inefficient implementation
mutate_if(group_by(df,a),a==1 | a==2,x=mean(b))
# Small benchmark
n = 1e6
df = data.frame(a= sample(1:3,n,replace=TRUE),
b= sample(1:100,n,replace=TRUE),
x=rnorm(n))
microbenchmark(times = 5L,
mutate(df, x=ifelse(a==2,x+100,x)),
mutate_if(df, a==2, x=x+100)
)
#Unit: milliseconds
# expr min lq median uq max neval
# mutate(df, x = ifelse(a == 2, x + 100, x)) 749.2954 754.4179 815.06681 820.95872 860.79326 5
# mutate_if(df, a == 2, x = x + 100) 72.2886 75.4189 77.47787 83.64689 86.33666 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment