Skip to content

Instantly share code, notes, and snippets.

@hannesdatta
Created March 24, 2020 15:56
Show Gist options
  • Save hannesdatta/aa26a5feecc75bc6a6f43d62117d38ff to your computer and use it in GitHub Desktop.
Save hannesdatta/aa26a5feecc75bc6a6f43d62117d38ff to your computer and use it in GitHub Desktop.
clean clear-text artist names from collaborations and secondary artists
require(stringi)
#' Normalise clear-text artist names into a comparable spelling variant
#'
#' Lower-cases and trims each name, strips articles, leading track numbers and
#' date-like tokens, peels matching punctuation off both ends, optionally drops
#' parenthesised suffixes and collaboration suffixes, and finally keeps only
#' ASCII letters, digits and single spaces.
#'
#' Uses base R only, so it works without stringr/stringi being attached.
#'
#' @param x Character vector of artist names.
#' @param remove_collabs If `TRUE`, cut everything from the first collaboration
#'   marker onwards (e.g. " feat.", " ft", " vs", " with", "&", "+", ",", "/").
#'   Defaults to `FALSE`.
#' @param remove_parentheses If `TRUE` (default), drop everything from the first
#'   "(" that is not the very first character of the string.
#' @return Character vector of the same length as `x` with the cleaned names.
#' @examples
#' spelling_variants("JAY-Z")                                       # "jayz"
#' spelling_variants("Rihanna feat. Drake", remove_collabs = TRUE)  # "rihanna"
spelling_variants <- function(x, remove_collabs = FALSE,
                              remove_parentheses = TRUE) {
  # Base-R replacement for stringr::str_trim(); "[\\h\\v]" also covers
  # horizontal/vertical whitespace beyond the trimws() default set.
  trim <- function(s) trimws(s, which = "both", whitespace = "[\\h\\v]")

  # Collaboration markers; each pattern removes the marker and the rest of the
  # string. They are applied in this order.
  qualifiers <- c(
    " feat .*", " feat[.].*", " ft.*", " ft[.].*", " featuring.*",
    " vs[.].*", " vs.*", " versus.*", " with.*", "[-].*", " / .*",
    "/.*", "[|].*", "[[].*[]]", "[)].*", ";.*", "[+].*", "[&] .*", "[&].*",
    ",.*", " and .*", " con .*", " e .*", " et .*",
    " x .*"
  )

  # Remove the article "a" between words.
  # NOTE(review): the replacement is "", so the neighbouring words are glued
  # together ("me and a gun" -> "me andgun"); kept as-is so existing keys stay
  # stable -- confirm whether " " was intended.
  ret <- gsub(" a ", "", tolower(trim(x)))

  removes <- c(
    # article; NOTE(review): unanchored, so it also hits words ending in "the "
    "the ",
    # digits wrapped in parentheses at the beginning of the string
    "^[(][[:digit:]]+[)]",
    # digits led by # at the beginning of the string
    "^[#][[:digit:]]{1,5}[[:punct:]|[:space:]]",
    # digits led by 0 at the beginning of the string
    "^[0][[:digit:]]{1,2}[[:punct:]|[:space:]]",
    # leading digits followed by punctuation/space ("12 - Hello" keeps "Hello")
    "^[[:digit:]]{1,3}[[:punct:]|[:space:]]",
    # date-like tokens such as 12.03.2020
    "[[:digit:]]{2}[[:punct:]][[:digit:]]{2}[[:punct:]][[:digit:]]{2,4}"
  )
  for (rem in removes) {
    ret <- trim(gsub(rem, "", ret))
  }

  # Remove a dash occurring within the first three characters (e.g. "JAY-Z");
  # because the prefix may be empty this also drops a leading dash.
  ret <- gsub("(^[a-z]{0,3})([-])", "\\1", ret)

  # Peel punctuation off both ends (e.g. "- Jodeli -"), at most five layers.
  # A layer is removed only when BOTH the first and the last character are
  # punctuation.
  for (i in seq_len(5L)) {
    n <- nchar(ret)
    ends <- paste0(substr(ret, 1L, 1L), substr(ret, n, n))
    strip <- grepl("[[:punct:]]{2}", ends)
    # `strip` is logical: TRUE shifts the start by one and the end by one.
    ret <- trim(substr(ret, 1L + strip, n - strip))
  }

  if (remove_parentheses) {
    # Remove content in parentheses, unless it is at the beginning of a string.
    ret <- gsub("(?<!^)[(].*", "", ret, perl = TRUE)
  }

  # Remove a dash, "+" or "[" at the beginning of the string.
  ret <- gsub("^[-]|^[+]|^[[]", "", ret)

  if (remove_collabs) {
    # Remove collaborations / secondary artists.
    for (qual in qualifiers) {
      ret <- sub(qual, "", trim(ret))
    }
  }

  # Spell out the first "30 " occurrence as "thirty ".
  ret <- trim(sub("30 ", "thirty ", ret))
  # Keep ASCII letters, digits and spaces only.
  ret <- trim(gsub("[^0-9A-Za-z ]", "", ret))
  # Collapse runs of spaces left behind by the removals above into one space.
  trim(gsub(" {2,}", " ", ret))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment