Last active
June 12, 2017 14:14
-
-
Save lissahyacinth/1906d2776acc2cd1ed4d555c629cc14a to your computer and use it in GitHub Desktop.
Base Replacement Regexp_Extract [R]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# regexp_extract - Replacement for base regexp matching in R
#
# Uses default matching for regular expressions, but supports capture groups.
# Main use is as an alternative to awkward gsub matching to pull out features.
#
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = 1)
# > "3"
# Allows multiple groups with %1, %2 expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = c("%1", "%2"))
# > "3" "k"
# And more complex matching with a string containing multiple expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                text = '3k2n2k',
#                perl = TRUE,
#                group = c("%1,%2"))
# > "3,k"
# Extract capture groups from the first match of `pattern` in `text`.
#
# Arguments:
#   pattern  Regular expression, passed to regexec().
#   text     String to search. NULL, zero-length, NA and "" all yield "".
#            If longer than one element, only the first is used (with a
#            warning).
#   perl     Passed to regexec(); TRUE selects Perl-compatible regexps.
#   group    Either a single non-negative number, or a character vector of
#            templates:
#              0          the whole match followed by every capture group
#              n          capture group n
#              "%n..."    each "%n" is replaced by capture group n ("%0" is
#                         the whole match); one result per template
#
# Returns a character vector; "" when there is no match or the requested
# group does not exist.
regexp_extract <- function(pattern,
                           text,
                           perl = TRUE,
                           group = 1) {
  # NULL and zero-length input have nothing to search.
  if (length(text) == 0L) {
    return("")
  }
  if (length(text) > 1L) {
    warning("text has length > 1 and only the first element will be used")
    text <- text[1L]
  }
  if (is.na(text) || !nzchar(text)) {
    return("")
  }

  if (!is.numeric(group) && !is.character(group)) {
    stop("Group must be a number (1) or a character string (\"%1, %2\")")
  }
  if (is.numeric(group)) {
    if (length(group) > 1L) {
      stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
    }
    # Reject numeric(0), NA and negative indices up front; they would
    # otherwise surface as obscure subsetting or `if` errors further down.
    if (length(group) != 1L || is.na(group) || group < 0) {
      stop("Group must be a single non-negative number")
    }
  } else if (!any(grepl("%", group, fixed = TRUE))) {
    stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
  }

  # Element 1 is the whole match, element n + 1 is capture group n.
  matches <- regmatches(text, regexec(pattern, text, perl = perl))[[1L]]
  if (length(matches) == 0L) {
    return("")
  }

  if (is.character(group)) {
    filled <- group
    # Substitute the highest-numbered group first so that "%1" cannot eat the
    # prefix of "%10". fixed = TRUE keeps backslashes in the matched text
    # literal instead of treating them as replacement escapes.
    for (idx in rev(seq_along(matches)) - 1L) {
      filled <- gsub(paste0("%", idx), matches[idx + 1L], filled,
                     fixed = TRUE)
    }
    return(filled)
  }

  if (group == 0) {
    return(matches)
  }

  # Indexing past the last capture group gives NA, reported as "".
  picked <- matches[group + 1]
  if (is.na(picked)) "" else picked
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment