Created
June 23, 2015 04:30
-
-
Save jwdink/9786a6c2cd07ac8e7be8 to your computer and use it in GitHub Desktop.
Dplyr's join functions with a progress bar. Hacky but helpful.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dplyr's Join w/ Progress Bar
#
# @author Jacob Dink
#         jacobwdink@gmail.com
#         github.com/jwdink
#
# @created April 8, 2015
#
# Dplyr's join functions with a progress bar. Hacky but helpful.
# Don't call this function, call the dplyr-named versions (e.g., left_join_pb)
#
# The join is run once per distinct value of the progress column and the
# pieces are row-bound together; the progress bar advances one step per value.
# NOTE: when both data frames are subset, rows can only match if they share
# the same value of the progress column, so the result equals a plain dplyr
# join only if that holds for your data (e.g. the column is one of the `by`
# keys).
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of factor column in (one or both of)
#        dfs. Progress indicated by position in the distinct values of it.
# @param character subset_which Which df has this factor column? First ('x'),
#        second ('y'), or 'both'?
# @param character joiner Name of the dplyr join to run: 'inner_join',
#        'left_join', 'right_join', 'full_join' or 'semi_join'.
#
# @return dataframe joined
join_pb <- function(x, y, by, progress_factor, subset_which = "both", joiner) {
  # Check Args (done before touching any package so bad calls fail fast):
  subset_which <- match.arg(arg = subset_which, choices = c("x", "y", "both"))
  joiners <- c("inner_join", "left_join", "right_join", "full_join",
               "semi_join")
  if (!is.character(joiner) || length(joiner) != 1L || !(joiner %in% joiners)) {
    # Without this check switch() below would silently yield NULL.
    stop("Arg joiner should be one of: ", paste(joiners, collapse = ", "), ".")
  }
  if (length(progress_factor) != 1) {
    stop('Arg progress_factor should be len=1 (name of a column).')
  }

  # Dependencies: fail loudly instead of require()'s warning + FALSE.
  for (pkg in c("dplyr", "pbapply")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required by join_pb().")
    }
  }
  join_fun <- switch(joiner,
    inner_join = dplyr::inner_join,
    left_join  = dplyr::left_join,
    right_join = dplyr::right_join,
    full_join  = dplyr::full_join,
    semi_join  = dplyr::semi_join
  )

  # Which Levels:
  # Values of the progress column as character; empty if the column is absent.
  values_of <- function(df) {
    if (progress_factor %in% names(df)) {
      as.character(df[[progress_factor]])
    } else {
      character(0)
    }
  }
  flevels <- character(0)
  if (joiner %in% c("inner_join", "left_join", "full_join", "semi_join")) {
    flevels <- c(flevels, values_of(x))
  }
  if (joiner %in% c("inner_join", "right_join", "full_join", "semi_join")) {
    flevels <- c(flevels, values_of(y))
  }
  flevels <- unique(flevels)
  if (length(flevels) == 0) {
    stop('Progress factor not present in the df indicated by subset_which.')
  }

  # Which to Subset:
  # Builds a function(flevel) returning the rows of `df` for that level, or
  # all of `df` when this side is not being subset.
  make_subsetter <- function(df, df_name, do_subset) {
    if (!do_subset) {
      return(function(flevel) df)
    }
    if (!(progress_factor %in% names(df))) {
      stop("Column '", progress_factor, "' not found in ", df_name,
           "; cannot subset it.")
    }
    values <- as.character(df[[progress_factor]])  # hoisted out of the loop
    function(flevel) {
      # Compare values directly (no string-built expression), and keep rows
      # whose progress value is NA together under the NA level.
      keep <- if (is.na(flevel)) {
        is.na(values)
      } else {
        !is.na(values) & values == flevel
      }
      df[which(keep), , drop = FALSE]
    }
  }
  left_subsetter <- make_subsetter(x, "x", subset_which %in% c("both", "x"))
  right_subsetter <- make_subsetter(y, "y", subset_which %in% c("both", "y"))

  # Helper: join the two (possibly subset) sides for a single level.
  join_helper <- function(flevel) {
    join_fun(x = left_subsetter(flevel),
             y = right_subsetter(flevel),
             by = by)
  }

  # Run:
  list_of_dfs <- pbapply::pblapply(X = flevels, FUN = join_helper)
  dplyr::bind_rows(list_of_dfs)
}
# inner_join with a progress bar. Arguments are passed straight to join_pb
# (see its header for their meaning); returns whatever join_pb returns.
inner_join_pb <- function(x, y, by, progress_factor, subset_which = "both") {
  # Forward the caller's subset_which rather than a hard-coded 'both'.
  join_pb(x, y, by, progress_factor,
          subset_which = subset_which, joiner = "inner_join")
}
# left_join with a progress bar. Arguments are passed straight to join_pb
# (see its header for their meaning); returns whatever join_pb returns.
left_join_pb <- function(x, y, by, progress_factor, subset_which = "both") {
  # Forward the caller's subset_which rather than a hard-coded 'both'.
  join_pb(x, y, by, progress_factor,
          subset_which = subset_which, joiner = "left_join")
}
# right_join with a progress bar. Arguments are passed straight to join_pb
# (see its header for their meaning); returns whatever join_pb returns.
right_join_pb <- function(x, y, by, progress_factor, subset_which = "both") {
  # Forward the caller's subset_which rather than a hard-coded 'both'.
  join_pb(x, y, by, progress_factor,
          subset_which = subset_which, joiner = "right_join")
}
# full_join with a progress bar. Arguments are passed straight to join_pb
# (see its header for their meaning); returns whatever join_pb returns.
full_join_pb <- function(x, y, by, progress_factor, subset_which = "both") {
  # Forward the caller's subset_which rather than a hard-coded 'both'.
  join_pb(x, y, by, progress_factor,
          subset_which = subset_which, joiner = "full_join")
}
# semi_join with a progress bar. Arguments are passed straight to join_pb
# (see its header for their meaning); returns whatever join_pb returns.
semi_join_pb <- function(x, y, by, progress_factor, subset_which = "both") {
  # Forward the caller's subset_which rather than a hard-coded 'both'.
  join_pb(x, y, by, progress_factor,
          subset_which = subset_which, joiner = "semi_join")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment