# lissahyacinth/regexp_extract.R

## regexp_extract.R
# regexp_extract - Replacement for base regexp_matching in R
#
# Uses default matching for regular expressions, but supports capture groups.
# Main use is an alternative to bad gsub matching to pull out features.
#
# regexp_extract(pattern = '([0-9])([a-z])',
#                 text = '3k2n2k',
#                 perl = TRUE,
#                 group = 1)
# > "3"
# Allows multiple groups with %1, %2 expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                 text = '3k2n2k',
#                 perl = TRUE,
#                 group = c("%1", "%2"))
# > "3" "k"
# And more complex matching with a string containing multiple expressions.
# regexp_extract(pattern = '([0-9])([a-z])',
#                 text = '3k2n2k',
#                 perl = TRUE,
#                 group = c("%1,%2"))
# > "3,k"


# regexp_extract: pull the first regex match, or pieces of it, out of a string.
#
# Arguments:
#   pattern  Regular expression handed to regexec().
#   text     A single string to search. NULL, zero-length, NA and "" all
#            short-circuit to "".
#   perl     Passed to regexec(); TRUE selects Perl-compatible syntax.
#   group    Either one non-negative whole number (0 = whole match,
#            k = k-th capture group) or a character template whose %k
#            tokens are replaced by the text of group k (%0 = whole match).
#
# Returns:
#   Numeric group: the requested piece, or "" when the pattern does not
#   match or the group does not exist.
#   Character group: `group` with every %k token filled in, or one "" per
#   template element when the pattern does not match. Tokens that name a
#   group the pattern does not have are left untouched.
#
# Errors:
#   stop() is raised for a `text` longer than one element, for a `group`
#   that is neither numeric nor character, for a numeric `group` that is
#   not a single non-negative whole number, and for a character `group`
#   whose first element contains no "%".
regexp_extract <- function(pattern,
                           text,
                           perl = TRUE,
                           group = 1) {
  # Nothing to search: keep the historical "" result.
  if (is.null(text) || length(text) == 0L) {
    return("")
  }
  if (length(text) > 1L) {
    stop("text must be a single string")
  }
  if (is.na(text) || !nzchar(text)) {
    return("")
  }

  if (!is.numeric(group) && !is.character(group)) {
    stop("Group must be a number (1) or a character string (\"%1, %2\")")
  }
  if (is.numeric(group)) {
    if (length(group) > 1L) {
      stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
    }
    if (length(group) != 1L || is.na(group) || group < 0 ||
        group != floor(group)) {
      stop("Group must be a single non-negative whole number")
    }
  } else if (!isTRUE(grepl("%", group[1L], fixed = TRUE))) {
    # Only the first template element is inspected, as before.
    stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
  }

  # found[1] is the whole match, found[k + 1] is capture group k;
  # character(0) means the pattern did not match at all.
  found <- regmatches(text, regexec(pattern, text, perl = perl))[[1L]]

  if (is.numeric(group)) {
    # Indexing past the end yields NA, which also covers "no match".
    piece <- found[group + 1]
    if (is.na(piece)) {
      return("")
    }
    return(piece)
  }

  if (length(found) == 0L) {
    return(rep("", length(group)))
  }

  # Fill every %k token in a single pass so that inserted text is never
  # re-scanned and is taken literally (no backslash interpretation).
  # Listing the indices in descending order makes "%10" win over "%1"
  # when group 10 exists, while "%10" still means group 1 followed by a
  # literal "0" when it does not.
  ids <- rev(seq_along(found) - 1L)
  token_re <- paste0("%(", paste(ids, collapse = "|"), ")")
  hits <- gregexpr(token_re, group, perl = TRUE)
  regmatches(group, hits) <- lapply(
    regmatches(group, hits),
    function(tok) found[as.integer(substring(tok, 2L)) + 1L]
  )
  group
}
	# regexp_extract - Replacement for base regexp_matching in R
	#
	# Uses default matching for regular expressions, but supports capture groups.
	# Main use is an alternative to bad gsub matching to pull out features.
	#
	# regexp_extract(pattern = '([0-9])([a-z])',
	# text = '3k2n2k',
	# perl = TRUE,
	# group = 1)
	# > "3"
	# Allows multiple groups with %1, %2 expressions.
	# regexp_extract(pattern = '([0-9])([a-z])',
	# text = '3k2n2k',
	# perl = TRUE,
	# group = c("%1", "%2"))
	# > "3" "k"
	# And more complex matching with a string containing multiple expressions.
	# regexp_extract(pattern = '([0-9])([a-z])',
	# text = '3k2n2k',
	# perl = TRUE,
	# group = c("%1,%2"))
	# > "3,k"


	# regexp_extract: pull the first regex match, or pieces of it, out of a
	# string.
	#
	# Arguments:
	#   pattern  Regular expression handed to regexec().
	#   text     A single string to search. NULL, zero-length, NA and "" all
	#            short-circuit to "".
	#   perl     Passed to regexec(); TRUE selects Perl-compatible syntax.
	#   group    Either one non-negative whole number (0 = whole match,
	#            k = k-th capture group) or a character template whose %k
	#            tokens are replaced by the text of group k (%0 = whole match).
	#
	# Returns:
	#   Numeric group: the requested piece, or "" when the pattern does not
	#   match or the group does not exist.
	#   Character group: `group` with every %k token filled in, or one "" per
	#   template element when the pattern does not match. Tokens that name a
	#   group the pattern does not have are left untouched.
	#
	# Errors:
	#   stop() is raised for a `text` longer than one element, for a `group`
	#   that is neither numeric nor character, for a numeric `group` that is
	#   not a single non-negative whole number, and for a character `group`
	#   whose first element contains no "%".
	regexp_extract <- function(pattern,
	                           text,
	                           perl = TRUE,
	                           group = 1) {
	  # Nothing to search: keep the historical "" result.
	  if (is.null(text) || length(text) == 0L) {
	    return("")
	  }
	  if (length(text) > 1L) {
	    stop("text must be a single string")
	  }
	  if (is.na(text) || !nzchar(text)) {
	    return("")
	  }

	  if (!is.numeric(group) && !is.character(group)) {
	    stop("Group must be a number (1) or a character string (\"%1, %2\")")
	  }
	  if (is.numeric(group)) {
	    if (length(group) > 1L) {
	      stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
	    }
	    if (length(group) != 1L || is.na(group) || group < 0 ||
	        group != floor(group)) {
	      stop("Group must be a single non-negative whole number")
	    }
	  } else if (!isTRUE(grepl("%", group[1L], fixed = TRUE))) {
	    # Only the first template element is inspected, as before.
	    stop("Use capture expressions (\"%1, %2\") rather than c(1,2)")
	  }

	  # found[1] is the whole match, found[k + 1] is capture group k;
	  # character(0) means the pattern did not match at all.
	  found <- regmatches(text, regexec(pattern, text, perl = perl))[[1L]]

	  if (is.numeric(group)) {
	    # Indexing past the end yields NA, which also covers "no match".
	    piece <- found[group + 1]
	    if (is.na(piece)) {
	      return("")
	    }
	    return(piece)
	  }

	  if (length(found) == 0L) {
	    return(rep("", length(group)))
	  }

	  # Fill every %k token in a single pass so that inserted text is never
	  # re-scanned and is taken literally (no backslash interpretation).
	  # Listing the indices in descending order makes "%10" win over "%1"
	  # when group 10 exists, while "%10" still means group 1 followed by a
	  # literal "0" when it does not.
	  ids <- rev(seq_along(found) - 1L)
	  token_re <- paste0("%(", paste(ids, collapse = "|"), ")")
	  hits <- gregexpr(token_re, group, perl = TRUE)
	  regmatches(group, hits) <- lapply(
	    regmatches(group, hits),
	    function(tok) found[as.integer(substring(tok, 2L)) + 1L]
	  )
	  group
	}