Skip to content

Instantly share code, notes, and snippets.

@malomarrec
Last active June 8, 2017 03:23
Show Gist options
  • Save malomarrec/3b351b07e38f1991f523827ad2d73224 to your computer and use it in GitHub Desktop.
Save malomarrec/3b351b07e38f1991f523827ad2d73224 to your computer and use it in GitHub Desktop.
"Slide" dataframe (merge two columns that are identical but are NAs in a partition of rows)
#This function "slides" dataframe blocks based on NA values
#Takes a dataframe and two column names
#On each row, one of these columns have to be NA
#The function returns a dataframe with a new column called <newname> with the combined non NA values of the first columns
#Different factor levels in both columns are dealt with by casting to character, then back to factor after the merge
#If newname is not specified, the first columns is overriden by the new column
#Optionnal parameters allows to drop the old columns
slide_df <- function(df,name1,name2,newname = NULL, drop = FALSE){
require(dplyr)
if(is.null(newname)){
newname = name1
}
#Factor data needs to be converted to characters first in order no to lose levels
if(is.factor(df[,name1]) || is.factor(df[,name2])){
df[,name1] <- as.character(df[,name1])
df[,name2] <- as.character(df[,name2])
}
slided_df = dplyr::mutate(df, !!newname := ifelse(is.na(df[,name1]),df[,name2],df[,name1]))
#If factor data, we need to cast character back to factor data
# if(is.factor(df[,name1]) || is.factor(df[,name2])){
# df[,newname] <- as.factor(df[,newname])
# }
if(drop){
if(is.null(newname)){
slided_df[,name1] <- NULL #dropping i only if a newname was specified, otherwise it contains the result
}
slided_df[,name2] <- NULL #dropping name_j
}
return(slided_df)
}
#Demo
a1 <- c(1,2,3,4,5,NA,NA,NA)
a2 <- c(NA,NA,NA,NA,NA,8,8,8)
b <- rep("a",8)
c1 <- c(1,2,3,4,5,NA,NA,NA)
c2 <- c(NA,NA,NA,NA,NA,'1','1','1')
b <- rep("a",8)
c1 <- c(1,2,3,4,5,NA,NA,NA)
c2 <- c(NA,NA,NA,NA,NA,'a','a','1')
df <- data.frame(a1 = a1,b = b, c1 = c1,a2=a2,c2=c2)
# a1 b c1 a2 c2
# 1 1 a 1 NA <NA>
# 2 2 a 2 NA <NA>
# 3 3 a 3 NA <NA>
# 4 4 a 4 NA <NA>
# 5 5 a 5 NA <NA>
# 6 NA a NA 8 a
# 7 NA a NA 8 a
# 8 NA a NA 8 1
> merge_cols(df,"a1","a2","a",drop=T)
merge_cols(df,"a1","a2","a",drop=T)
# a1 b c1 a2 c2
# 1 1 a 1 NA <NA>
# 2 2 a 2 NA <NA>
# 3 3 a 3 NA <NA>
# 4 4 a 4 NA <NA>
# 5 5 a 5 NA <NA>
# 6 NA a NA 8 a
# 7 NA a NA 8 a
# 8 NA a NA 8 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment