Last active
November 18, 2016 10:10
-
-
Save valentinitnelav/1b4111666a034d964b389f79ad47ec95 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Get the first author's surname from a string of names
#'
#' Returns the surname of the first author in upper case, keeping a leading
#' "nobiliary" particle when one is present (e.g. "von Frisch",
#' "van der Waals").
#'
#' NOTE: The function still needs further testing. Use with care!
#'
#' @param strg A character vector. Each element is a string of names that
#'   starts with the first author's surname, e.g. "von Frisch, GJ; Johnson,".
#' @return A character vector of the same length as `strg` holding the
#'   upper-cased surname (particle included). Elements that are `NA` or that
#'   contain no name token yield `NA_character_`.
get_author_name <- function(strg) {
  # remove any leading and trailing whitespace
  strg <- trimws(as.character(strg))

  # A particle only counts when it is a separate word, i.e. it is followed by
  # whitespace. Without that requirement surnames such as "Dean", "Vance" or
  # "d'Artagnan" would be mistaken for particle + surname.
  has_two_particles <- grepl(
    pattern = "^(van|von)[[:space:]]+der[[:space:]]",
    x = strg, perl = TRUE, ignore.case = TRUE
  )
  # if there are two particles then don't test for one particle anymore
  has_one_particle <- !has_two_particles & grepl(
    pattern = "^(von|van|de|der|di|d'|lord)[[:space:]]",
    x = strg, perl = TRUE, ignore.case = TRUE
  )

  # number of leading words that make up the surname: 1, 2 or 3
  n_words <- 1L + has_one_particle + 2L * has_two_particles

  # replace all punctuation with space (except apostrophe and minus sign)
  cleaned <- gsub(
    pattern = "[^[:alnum:][:space:]'-]", replacement = " ",
    x = strg, perl = TRUE
  )
  # split on runs of whitespace so repeated blanks do not create empty words
  words <- strsplit(trimws(cleaned), split = "[[:space:]]+", perl = TRUE)

  vapply(
    seq_along(words),
    function(i) {
      w <- words[[i]]
      if (length(w) == 0L || is.na(w[1L])) {
        return(NA_character_)
      }
      # head() never reads past the available words, so no "NA" gets pasted
      toupper(paste(head(w, n_words[i]), collapse = " "))
    },
    character(1)
  )
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# EXAMPLES
# The value printed by each call is shown in the "#>" comment below it.
get_author_name("Anders-Frisch, GJ; Johnson,")
#> [1] "ANDERS-FRISCH"
get_author_name("von Frisch, GJ; Johnson,")
#> [1] "VON FRISCH"
get_author_name("von Anders-Frisch, GJ; Johnson,")
#> [1] "VON ANDERS-FRISCH"
get_author_name("van der Anders-Frisch, GJ; Johnson,")
#> [1] "VAN DER ANDERS-FRISCH"
get_author_name("lord Anders-Frisch, GJ; Johnson,")
#> [1] "LORD ANDERS-FRISCH"
get_author_name("De Exemplu, GJ; Johnson,")
#> [1] "DE EXEMPLU"
get_author_name("An'der-son, GJ; Johnson,")
#> [1] "AN'DER-SON"
get_author_name("An' ders-on, GJ; Johnson,")
#> [1] "AN'"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment