Skip to content

Instantly share code, notes, and snippets.

@lissahyacinth
Last active June 12, 2017 14:14
Show Gist options
  • Save lissahyacinth/1906d2776acc2cd1ed4d555c629cc14a to your computer and use it in GitHub Desktop.
Save lissahyacinth/1906d2776acc2cd1ed4d555c629cc14a to your computer and use it in GitHub Desktop.
Base Replacement Regexp_Extract [R]
# regexp_extract - Replacement for base regexp matching in R
#
# Runs `pattern` against a single string with regexec() and returns either
# one capture group or a template filled in with capture groups. Main use is
# as an alternative to awkward gsub() calls when pulling features out of text.
#
# Arguments:
#   pattern  Regular expression, passed to regexec().
#   text     A single string to search. NULL, zero-length, NA and "" all
#            return "".
#   perl     Passed to regexec(); TRUE selects Perl-compatible regexps.
#   group    What to return:
#              0          the full match followed by every capture group,
#                         as a character vector
#              k >= 1     capture group k
#              character  template(s) in which each "%k" is replaced by
#                         capture group k ("%0" is the full match); the
#                         first element must contain a "%"
#
# Returns a character vector. "" is returned when nothing matched or the
# requested group does not exist (one "" per template in template mode).
# Placeholders that refer to a group the pattern does not have are left
# untouched.
#
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = 1)
# > "3"
# Allows multiple groups with %1, %2 expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = c("%1", "%2"))
# > "3" "k"
# And more complex matching with a string containing multiple expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = c("%1,%2"))
# > "3,k"
regexp_extract <- function(pattern,
                           text,
                           perl = TRUE,
                           group = 1) {
  # Empty input short-circuits before `group` is validated.
  if (is.null(text) || length(text) == 0L) {
    return("")
  }
  if (length(text) > 1L) {
    stop("text must be a single string", call. = FALSE)
  }
  if (is.na(text) || text == "") {
    return("")
  }

  if (!is.numeric(group) && !is.character(group)) {
    stop("Group must be a number (1) or a character string (\"%1, %2\")",
         call. = FALSE)
  }
  if (is.numeric(group)) {
    if (length(group) != 1L) {
      stop("Use capture expressions (\"%1, %2\") rather than c(1,2)",
           call. = FALSE)
    }
    if (is.na(group) || group < 0) {
      stop("Group must be a number (1) or a character string (\"%1, %2\")",
           call. = FALSE)
    }
  } else if (!grepl("%", group[1L], fixed = TRUE)) {
    # Only the first template is inspected for a placeholder.
    stop("Use capture expressions (\"%1, %2\") rather than c(1,2)",
         call. = FALSE)
  }

  # matches[1] is the full match, matches[k + 1] is capture group k;
  # the vector is empty when the pattern did not match at all.
  matches <- regmatches(text, regexec(pattern, text, perl = perl))[[1]]

  if (is.character(group)) {
    if (length(matches) == 0L) {
      return(rep("", length(group)))
    }
    filled <- group
    # Substitute the highest index first so "%10" is not consumed as "%1"
    # followed by a literal "0". fixed = TRUE keeps backslashes in the
    # captured text from being read as replacement escapes.
    # NOTE: captured text that itself contains "%k" for a lower k is still
    # substituted again by a later pass.
    for (idx in rev(seq_along(matches)) - 1L) {
      filled <- gsub(paste0("%", idx), matches[idx + 1L], filled,
                     fixed = TRUE)
    }
    return(filled)
  }

  if (length(matches) == 0L) {
    return("")
  }
  if (group == 0) {
    return(matches)
  }
  picked <- matches[group + 1]
  # Indexing past the last capture group yields NA.
  if (is.na(picked)) "" else picked
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment