# valentinitnelav/Get first name from string.R

## Get first name from string.R
# function to get the first author's name from a string of names (keeping any leading particle, e.g. von Frisch)
# strg = a string containing names
# NOTE: The function still needs further testing. Use with care!

#' Extract the first author's name from a string of names
#'
#' Returns the first name found in each element of `strg`, upper-cased. A
#' leading "nobiliary" particle (von, van, de, der, di, d', lord, van der,
#' von der) is kept together with the name that follows it.
#'
#' @param strg Character vector; each element is a string of names such as
#'   "von Frisch, GJ; Johnson,".
#' @return Character vector of the same length as `strg` holding the
#'   upper-cased first name; `NA` where an element is `NA` or has no name.
#' @note The function still needs further testing. Use with care!
get_author_name <- function(strg) {
  # A particle only counts when it is a separate word followed by whitespace,
  # so "Vandenberg", "Dean" or "d'Artagnan" are treated as plain names.
  two_particles <- "^(van|von)[[:space:]]+der[[:space:]]"
  one_particle <- "^(von|van|de|der|di|d'|lord)[[:space:]]"

  extract_one <- function(name) {
    # remove any leading and trailing whitespace
    name <- trimws(name)

    # number of whitespace-separated tokens that make up the name
    if (grepl(two_particles, name, ignore.case = TRUE)) {
      n_tokens <- 3L
    } else if (grepl(one_particle, name, ignore.case = TRUE)) {
      n_tokens <- 2L
    } else {
      n_tokens <- 1L
    }

    # Replace punctuation with a space, keeping apostrophes and hyphens.
    # The default regex engine is used on purpose: with perl = TRUE the POSIX
    # classes are ASCII-only by default, which would split accented names.
    name <- gsub("[^[:alnum:][:space:]'-]", " ", name)

    # Split on runs of whitespace so repeated spaces do not yield empty tokens.
    tokens <- strsplit(trimws(name), "[[:space:]]+")[[1]]

    if (length(tokens) == 0L || is.na(tokens[1]) || !nzchar(tokens[1])) {
      return(NA_character_)
    }

    # Never index past the available tokens (would paste a literal "NA").
    keep <- seq_len(min(n_tokens, length(tokens)))
    toupper(paste0(tokens[keep], collapse = " "))
  }

  vapply(as.character(strg), extract_one, character(1), USE.NAMES = FALSE)
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# EXAMPLES
# Each call is followed by its expected console output, kept as a comment so
# the file can be parsed and sourced.
get_author_name("Anders-Frisch, GJ; Johnson,")
# [1] "ANDERS-FRISCH"
get_author_name("von Frisch, GJ; Johnson,")
# [1] "VON FRISCH"
get_author_name("von Anders-Frisch, GJ; Johnson,")
# [1] "VON ANDERS-FRISCH"
get_author_name("van der Anders-Frisch, GJ; Johnson,")
# [1] "VAN DER ANDERS-FRISCH"
get_author_name("lord Anders-Frisch, GJ; Johnson,")
# [1] "LORD ANDERS-FRISCH"
get_author_name("De Exemplu, GJ; Johnson,")
# [1] "DE EXEMPLU"
get_author_name("An'der-son, GJ; Johnson,")
# [1] "AN'DER-SON"
get_author_name("An' ders-on, GJ; Johnson,")
# [1] "AN'"
	# function to get the first author's name from a string of names (keeping any leading particle, e.g. von Frisch)
	# strg = a string containing names
	# NOTE: The function still needs further testing. Use with care!

	#' Extract the first author's name from a string of names
	#'
	#' Returns the first name found in each element of `strg`, upper-cased. A
	#' leading "nobiliary" particle (von, van, de, der, di, d', lord, van der,
	#' von der) is kept together with the name that follows it.
	#'
	#' @param strg Character vector; each element is a string of names such as
	#'   "von Frisch, GJ; Johnson,".
	#' @return Character vector of the same length as `strg` holding the
	#'   upper-cased first name; `NA` where an element is `NA` or has no name.
	#' @note The function still needs further testing. Use with care!
	get_author_name <- function(strg) {
	  # A particle only counts when it is a separate word followed by whitespace,
	  # so "Vandenberg", "Dean" or "d'Artagnan" are treated as plain names.
	  # (Alternation uses a plain "|": "\|" is not a valid escape in an R string.)
	  two_particles <- "^(van|von)[[:space:]]+der[[:space:]]"
	  one_particle <- "^(von|van|de|der|di|d'|lord)[[:space:]]"

	  extract_one <- function(name) {
	    # remove any leading and trailing whitespace
	    name <- trimws(name)

	    # number of whitespace-separated tokens that make up the name
	    if (grepl(two_particles, name, ignore.case = TRUE)) {
	      n_tokens <- 3L
	    } else if (grepl(one_particle, name, ignore.case = TRUE)) {
	      n_tokens <- 2L
	    } else {
	      n_tokens <- 1L
	    }

	    # Replace punctuation with a space, keeping apostrophes and hyphens.
	    # The default regex engine is used on purpose: with perl = TRUE the POSIX
	    # classes are ASCII-only by default, which would split accented names.
	    name <- gsub("[^[:alnum:][:space:]'-]", " ", name)

	    # Split on runs of whitespace so repeated spaces do not yield empty tokens.
	    tokens <- strsplit(trimws(name), "[[:space:]]+")[[1]]

	    if (length(tokens) == 0L || is.na(tokens[1]) || !nzchar(tokens[1])) {
	      return(NA_character_)
	    }

	    # Never index past the available tokens (would paste a literal "NA").
	    keep <- seq_len(min(n_tokens, length(tokens)))
	    toupper(paste0(tokens[keep], collapse = " "))
	  }

	  vapply(as.character(strg), extract_one, character(1), USE.NAMES = FALSE)
	}

	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# EXAMPLES
	# Each call is followed by its expected console output, kept as a comment so
	# the file can be parsed and sourced.
	get_author_name("Anders-Frisch, GJ; Johnson,")
	# [1] "ANDERS-FRISCH"
	get_author_name("von Frisch, GJ; Johnson,")
	# [1] "VON FRISCH"
	get_author_name("von Anders-Frisch, GJ; Johnson,")
	# [1] "VON ANDERS-FRISCH"
	get_author_name("van der Anders-Frisch, GJ; Johnson,")
	# [1] "VAN DER ANDERS-FRISCH"
	get_author_name("lord Anders-Frisch, GJ; Johnson,")
	# [1] "LORD ANDERS-FRISCH"
	get_author_name("De Exemplu, GJ; Johnson,")
	# [1] "DE EXEMPLU"
	get_author_name("An'der-son, GJ; Johnson,")
	# [1] "AN'DER-SON"
	get_author_name("An' ders-on, GJ; Johnson,")
	# [1] "AN'"