a more efficient way to filter a grouped data frame?
library(tidyverse)
library(digest)
# create a dummy data frame with 10,000 groups and 1,000,000 rows,
# where group ids are the md5 hashes of the integers from 1 to 10,000
set.seed(42)
md5 <- Vectorize(function(x) digest(x, algo="md5"))
df <- data.frame(group_id = sample(md5(1:1e4), 1e6, replace = TRUE),
                 val = sample(1:100, 1e6, replace = TRUE))
# group observations by group_id, creating an index on group_id in the background
df <- df %>%
  group_by(group_id)
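# aside (a hedged sketch, not from the original gist): the index dplyr builds can be
# inspected directly; in the dplyr version used here it lives in internal attributes,
# while dplyr >= 0.8 exposes it via group_rows() / group_data()
n_groups(df)                     # number of groups covered by the index
# str(attr(df, "indices")[1:2])  # 0-based row indices per group (dplyr <= 0.7.x internals)
# group_rows(df)[1:2]            # 1-based equivalent in dplyr >= 0.8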
########################################
# bad: filter by group id, the naive way
########################################
# this is slow for two reasons:
# first, it's a linear scan over all rows
# second, the grouping itself adds per-group overhead to filter()
system.time( df1 <- df %>% filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") )
# user system elapsed
# 1.416 0.485 1.957
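# a quick sketch (not in the original gist) to isolate the second cost: running the same
# filter on an ungrouped copy skips the per-group bookkeeping, so any remaining time is
# the linear scan itself
system.time( df %>% ungroup() %>% filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") )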
########################################
# better: filter by group id, the smart way
########################################
# this is faster than the above because it uses the group index created by dplyr
# as a result it's linear in the total number of groups + the length of the requested group
# create a function that uses the group indices to filter more efficiently
filter_groups <- function(df, filter_formula) {
  # quosure magic for tidy evaluation
  filter_formula <- enquo(filter_formula)

  # make sure we're given a grouped data frame
  if (!("grouped_df" %in% class(df))) {
    return(data.frame())
  }

  # find the group index for this group label
  labels <- attr(df, "labels") %>%
    rowid_to_column() %>%
    filter(!!filter_formula)

  # find the indices of all rows in this group
  ndx <- unlist(attr(df, "indices")[labels$rowid])

  # return the rows for this group, adjusting for 0-based indexing
  df[ndx + 1, ]
}
system.time( df2 <- filter_groups(df, group_id == "4b5630ee914e848e8d07221556b0a2fb") )
# user system elapsed
# 0.002 0.000 0.001
# check that results are the same
all(df1 == df2)
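# note: filter_groups() relies on dplyr internals ("labels"/"indices" attributes, 0-based
# indices) that changed in dplyr 0.8. a hedged sketch of the same idea against the public
# group metadata API in dplyr >= 0.8, where group_keys() and group_rows() return group
# labels and 1-based row indices in matching order:
filter_groups_v2 <- function(df, filter_formula) {
  filter_formula <- enquo(filter_formula)
  if (!inherits(df, "grouped_df")) {
    return(data.frame())
  }
  # find the group index for this group label
  keys <- group_keys(df) %>%
    rowid_to_column() %>%
    filter(!!filter_formula)
  # find the indices of all rows in this group (already 1-based, so no offset needed)
  ndx <- unlist(group_rows(df)[keys$rowid])
  df[ndx, ]
}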
########################################
# much cleaner, slightly slower: create a nested data frame, then filter
########################################
# h/t to @hadleywickham for this solution
system.time( df_nested <- df %>% nest() )
# user system elapsed
# 0.607 0.017 0.630
system.time(
df3 <- df_nested %>%
filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") %>%
unnest()
)
# user system elapsed
# 0.005 0.000 0.005
all(df1 == df3)
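# a hedged sketch (not in the original gist) comparing all three approaches in one place,
# assuming the microbenchmark package is installed; note that the one-time cost of nest()
# is not included in the nested timing
library(microbenchmark)
microbenchmark(
  naive  = df %>% filter(group_id == "4b5630ee914e848e8d07221556b0a2fb"),
  index  = filter_groups(df, group_id == "4b5630ee914e848e8d07221556b0a2fb"),
  nested = df_nested %>%
    filter(group_id == "4b5630ee914e848e8d07221556b0a2fb") %>%
    unnest(),
  times = 10
)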
jhofman commented Jul 31, 2018:
example with two columns:

df <- data.frame(first_id = sample(md5(1:1e2), 1e6, replace = TRUE),
                 second_id = sample(md5(1:1e2), 1e6, replace = TRUE),
                 val = sample(1:100, 1e6, replace = TRUE)) %>%
  group_by(first_id, second_id)

df %>% filter_groups(first_id == "06cd248dd1409b804444bd9ad5533d1d" & second_id == "11946e7a3ed5e1776e81c0f0ecd383d0")
