Skip to content

Instantly share code, notes, and snippets.

@halpo
Created September 21, 2016 21:27
Show Gist options
  • Save halpo/7809c07664efba522ce31c8d37d7169f to your computer and use it in GitHub Desktop.
Save halpo/7809c07664efba522ce31c8d37d7169f to your computer and use it in GitHub Desktop.
spread_each and margins functions which compliment dplyr and tidyr packages.
margins <-
function( grouped #< [grouped_df] A data frame with groups defined.
, ... #< passed to `FUN`
, FUN=dplyr::summarize #< Summary function
, all.name = getOption("margins::all.name", "(All)")
){
"Add margins to summarization"
g <- groups(grouped)
com <- lapply(seq(0, length(g)), combn, x=g, simplify=FALSE)
com <- Reduce(c, com)
com <- com[order(desc(sapply(com, length)))]
l <-
lapply(com, function(g){
g2 <- if(length(g)) group_by_(ungroup(grouped), .dots=g) else ungroup(grouped)
FUN(g2, ...)
}) %>%
bind_rows
for(v in as.character(g)){
if(v %in% names(l))
l[[v]] <- xifnotNA(as.character(l[[v]]), all.name)
else
l[[v]] <- all.name
}
l
}
with_margins <-
function( FUN #< Function to compute one groups and margins.
, all.name = getOption("margins::all.name", "(All)") #< name to fill in margins.
){
#! Compute marginals
#!
#! `with_margins` creates another function that performs the given
#! `FUN` function over the given groups and all possible marginals
#! of the groups, including an overall computation.
#! This will most commonly be used with summarise but can also
#! be used with other functions.
function(.data, ...){
g <- groups(.data)
com <- lapply(seq(0, length(g)), combn, x=g, simplify=FALSE) %>%
Reduce(c, .)
com <- com[order(desc(sapply(com, length)))]
lapply(com, function(x, .data){
FUN(group_by_(.data, .dots=x, add=FALSE), ...) %>%
mutate_( .dots =
structure( replicate(length(setdiff(g, x)), lazyeval::lazy(all.name), simplify=FALSE)
, names = as.character(setdiff(g,x))
)
)
}, .data) %>%
bind_rows
}
}
if(FALSE){#! @example
.data <- expand.grid( x = c( 'a', 'b', 'c')
, y = c( 'd', 'e', 'f')
, .rep = 1:10
, stringsAsFactors=FALSE
) %>%
mutate( v = rnorm(90)) %>%
select(-.rep) %>%
group_by(x, y)
with_margins(summarise)(.data, N=n(), sum=sum(v))
margins(.data, N=n(), sum=sum(v))
}
spread_each <-
function( data #< A <data.frame> or <tbl>
, key #< unquoted variable name to make into columns
, ... #< (sub-)columns of values
, fill=NA #< passed to <spread>.
, convert=FALSE #< passed to <spread>.
, drop=FALSE #< passed to <spread>.
, sep='.' #< Separator that goes between the key value and "..." column names.
){
#! wrapper for spread_each_
spread_each_( data
, key.col = tidyr:::col_name(substitute(key))
, .dots = lazyeval::lazy_dots(...)
, fill=fill, convert=convert, drop=drop, sep=sep
)
}
spread_each_ <-
function( data #< A <data.frame> or <tbl>
, key #< unquoted variable name to make into columns
, ... #< (sub-)columns of values
, .dots #< a <lazy_dots> list.
, key.col = tidyr:::col_name(substitute(key))
#< the character name of the key column.
, fill=NA #< passed to <spread>.
, convert=FALSE #< passed to <spread>.
, drop=FALSE #< passed to <spread>.
, sep='.' #< Separator that goes between the key value and "..." column names.
){
#! Spread a key column with multiple sub columns
#!
#! Creates a <tbl> with a the values of key as columns with
#! the variables listed in ... as sub columns.
dots <- lazyeval::all_dots(.dots, ...)
value.cols <- dplyr:::select_vars_(names(data), dots)
grouping.cols <- names(data) %>% setdiff(key.col) %>% setdiff(value.cols)
grouping.dots <- grouping.cols %>%
(lazyeval::as.lazy_dots)(.) %>%
dplyr:::resolve_vars(tbl_vars(data))
data <- group_by_(data, .dots=grouping.dots)
f <- function(col){
select.vars <- c(key.col, col, grouping.cols) %>%
lapply(as.name) %>%
(lazyeval::as.lazy_dots)(.) %>%
dplyr:::resolve_vars(tbl_vars(data))
x <-
spread_( data = select_(data, .dots=select.vars),
, key_col = key.col
, value_col = col
, fill = fill
, sep = NULL
)
newcols <- setdiff(names(x), grouping.cols)
new.names <-
structure( newcols %>% lapply(as.name) %>% lapply(lazyeval::as.lazy)
, names=paste(newcols, col, sep=sep)
)
rename_(.data=x, .dots=new.names)
}
col.order <-
unique(getElement(data, key.col)) %>% as.character() %>%
lapply(., function(x, env){
lazyeval::as.lazy(call('starts_with', x), env=env)
}, env=environment()) %>%
c(lapply(grouping.cols, lazyeval::as.lazy, env=environment()), .)
lapply(value.cols, f) %>%
Reduce(f=full_join, x=.) %>%
select_(., .dots=col.order)
}
if(FALSE){#! @example
data <- expand.grid( x = c( 'a', 'b', 'c')
, y = c( 'd', 'e', 'f')
, .rep = 1:10
, stringsAsFactors=FALSE
) %>%
mutate( v = rnorm(90)) %>%
group_by(x, y) %>%
summarise(N=n(), sum=sum(v))
# Data is a data.frame with columns x, y, N, and sum.
# Spread column y over columns N and sum
spread_each(data, y, N, sum)
# creates a tbl_df dataset with columns:
# x, d.N, d.sum, e.N, e.sum, f.N, f.sum
}
@namarkus
Copy link

namarkus commented May 5, 2017

Awesome! That margin function is really missing in dplyr.
2 comments on your code:

  • function "xifnotNA" is not defined. could you provide the code for it as well?
  • somehow one group vanishes using margins-function:

mtcars %>% group_by(cyl, gear, carb) %>% summarise(n=n(), mmpg=mean(mpg)) returns a df with 5 columns
mtcars %>% group_by(cyl, gear, carb) %>% summarise(n=n(), mmpg=mean(mpg)) %>% margins(nn=sum(n), mmpg=weighted.mean(mmpg, n)) only returns 4 columns, carb is missing. do you have any idea how to fix this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment