Skip to content

Instantly share code, notes, and snippets.

@jwdink
Created June 23, 2015 04:30
Show Gist options
  • Save jwdink/9786a6c2cd07ac8e7be8 to your computer and use it in GitHub Desktop.
Save jwdink/9786a6c2cd07ac8e7be8 to your computer and use it in GitHub Desktop.
Dplyr's join functions with a progress bar. Hacky but helpful.
# Dplyr's Join w/ Progress Bar
#
# @author Jacob Dink
# jacobwdink@gmail.com
# github.com/jwdink
#
# @created April 8, 2015
#
# Dplyr's join functions with a progress bar. Hacky but helpful.
# Don't call join_pb directly; call the dplyr-named wrappers instead (e.g., left_join_pb).
#
# @param dataframe x Left-hand table.
# @param dataframe y Right-hand table.
# @param character.vector by Join columns; passed unchanged to the dplyr join.
# @param character progress_factor Name of a column in (one or both of) the dfs.
#   The join is run once per unique value of this column (NA counts as its own
#   value), and the progress bar advances after each one.
# @param character subset_which Which df has this column? 'x', 'y', or 'both'.
#   A df that is not subset is passed whole to every per-value join.
# @param character joiner Name of the dplyr join to run: 'inner_join',
#   'left_join', 'right_join', 'full_join' or 'semi_join'.
#
# @return dataframe joined: the per-value join results stacked with bind_rows().
#
# Stops with an error if an argument is invalid, if progress_factor is missing
# from a df that subset_which says should be subset, if no values are found to
# iterate over, or if dplyr / pbapply are not installed.
join_pb = function(x, y, by, progress_factor, subset_which = 'both', joiner) {
  # Check Args (done before touching any package so bad input fails fast):
  subset_which = match.arg(arg = subset_which, choices = c('x', 'y', 'both'))
  join_choices = c('inner_join', 'left_join', 'right_join', 'full_join', 'semi_join')
  if (missing(joiner) || !is.character(joiner) || length(joiner) != 1 ||
      !(joiner %in% join_choices)) {
    # Previously an unknown joiner fell through switch() as NULL and surfaced
    # later as an unrelated "Progress factor not present" error.
    stop('Arg joiner should be one of: ', paste(join_choices, collapse = ', '), '.')
  }
  if (!is.character(progress_factor) || length(progress_factor) != 1) {
    stop('Arg progress_factor should be len=1 (name of a column).')
  }
  subset_x = subset_which %in% c('both', 'x')
  subset_y = subset_which %in% c('both', 'y')
  if ((subset_x && !(progress_factor %in% names(x))) ||
      (subset_y && !(progress_factor %in% names(y)))) {
    stop('Progress factor not present in the df indicated by subset_which.')
  }

  # Dependencies: fail loudly if missing (require() only warns), and use `::`
  # so nothing is attached to the caller's search path.
  for (pkg in c('dplyr', 'pbapply')) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required but is not installed.")
    }
  }
  join_fun = switch(joiner,
    inner_join = dplyr::inner_join,
    left_join = dplyr::left_join,
    right_join = dplyr::right_join,
    full_join = dplyr::full_join,
    semi_join = dplyr::semi_join
  )

  # Progress column of each df as character, computed once and reused for
  # every subset (empty when the df does not have the column).
  keys_of = function(df) {
    if (progress_factor %in% names(df)) as.character(df[[progress_factor]]) else character(0)
  }
  x_keys = keys_of(x)
  y_keys = keys_of(y)

  # Which Levels: only tables whose unmatched rows can reach the output
  # contribute values (x for everything but right_join, y for everything but
  # left_join).
  flevels = character(0)
  if (joiner %in% c('inner_join', 'left_join', 'full_join', 'semi_join')) {
    flevels = c(flevels, x_keys)
  }
  if (joiner %in% c('inner_join', 'right_join', 'full_join', 'semi_join')) {
    flevels = c(flevels, y_keys)
  }
  flevels = unique(flevels)
  if (length(flevels) == 0) stop('Progress factor not present in the df indicated by subset_which.')

  # Which to Subset: compare values directly instead of pasting them into a
  # formula string, so values containing quotes and non-syntactic column names
  # work. NA is matched with is.na() so those rows are not silently dropped.
  subset_level = function(df, keys, flevel) {
    rows = if (is.na(flevel)) which(is.na(keys)) else which(keys == flevel)
    df[rows, , drop = FALSE]
  }

  # Helper: join the slice(s) belonging to one value of the progress column.
  join_helper = function(flevel) {
    join_fun(
      x = if (subset_x) subset_level(x, x_keys, flevel) else x,
      y = if (subset_y) subset_level(y, y_keys, flevel) else y,
      by = by
    )
  }

  # Run: bind_rows() replaces the deprecated rbind_all().
  list_of_dfs = pbapply::pblapply(X = flevels, FUN = join_helper)
  dplyr::bind_rows(list_of_dfs)
}
# inner_join with a progress bar: forwards every argument to join_pb with
# joiner = 'inner_join'.
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of the column the join is chunked by.
# @param character subset_which Which df has that column: 'x', 'y' or 'both'.
#
# @return whatever join_pb returns for these arguments
inner_join_pb = function(x, y, by, progress_factor, subset_which = 'both') {
  # Pass the caller's subset_which through (it used to be hard-coded to 'both').
  join_pb(x, y, by, progress_factor, subset_which = subset_which, joiner = 'inner_join')
}
# left_join with a progress bar: forwards every argument to join_pb with
# joiner = 'left_join'.
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of the column the join is chunked by.
# @param character subset_which Which df has that column: 'x', 'y' or 'both'.
#
# @return whatever join_pb returns for these arguments
left_join_pb = function(x, y, by, progress_factor, subset_which = 'both') {
  # Pass the caller's subset_which through (it used to be hard-coded to 'both').
  join_pb(x, y, by, progress_factor, subset_which = subset_which, joiner = 'left_join')
}
# right_join with a progress bar: forwards every argument to join_pb with
# joiner = 'right_join'.
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of the column the join is chunked by.
# @param character subset_which Which df has that column: 'x', 'y' or 'both'.
#
# @return whatever join_pb returns for these arguments
right_join_pb = function(x, y, by, progress_factor, subset_which = 'both') {
  # Pass the caller's subset_which through (it used to be hard-coded to 'both').
  join_pb(x, y, by, progress_factor, subset_which = subset_which, joiner = 'right_join')
}
# full_join with a progress bar: forwards every argument to join_pb with
# joiner = 'full_join'.
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of the column the join is chunked by.
# @param character subset_which Which df has that column: 'x', 'y' or 'both'.
#
# @return whatever join_pb returns for these arguments
full_join_pb = function(x, y, by, progress_factor, subset_which = 'both') {
  # Pass the caller's subset_which through (it used to be hard-coded to 'both').
  join_pb(x, y, by, progress_factor, subset_which = subset_which, joiner = 'full_join')
}
# semi_join with a progress bar: forwards every argument to join_pb with
# joiner = 'semi_join'.
#
# @param dataframe x
# @param dataframe y
# @param character.vector by
# @param character progress_factor Name of the column the join is chunked by.
# @param character subset_which Which df has that column: 'x', 'y' or 'both'.
#
# @return whatever join_pb returns for these arguments
semi_join_pb = function(x, y, by, progress_factor, subset_which = 'both') {
  # Pass the caller's subset_which through (it used to be hard-coded to 'both').
  join_pb(x, y, by, progress_factor, subset_which = subset_which, joiner = 'semi_join')
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment