Skip to content

Instantly share code, notes, and snippets.

@briatte
Last active October 23, 2017 18:59
Show Gist options
  • Save briatte/a665f6bb08439e9d0412 to your computer and use it in GitHub Desktop.
Save briatte/a665f6bb08439e9d0412 to your computer and use it in GitHub Desktop.
manipulate full names with stringr
library(stringr)
library(rvest)
test = read_html("http://www.parlament.gv.at/WWER/NR") %>%
html_node(xpath = "//a[contains(text(), 'anzeigen')]") %>%
html_attr("href") %>%
paste0("http://www.parlament.gv.at", .) %>%
read_html %>%
html_nodes(".tabelle td a") %>%
html_text
#' Clean spaces
str_space <- function(x) {
gsub("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ", x)
}
#' Adjust punctuation
#' @examples
#' str_punct("a, b, c") # not modified
#' str_punct("a,,b .c") # single comma, fixed spacing before period
#' str_punct("!a--bc;") # single dash, no punctation at start/end
str_punct <- function(x, preserve = c(",", "'", "-", "\\."),
space = c(",", ";", "\\.", "?", "!")) {
# punctation
p = c("!", "\\", "#", "$", "%", "&", "'", "\\(", "\\)", "\\*", "\\+", ",",
"-", "\\.", "/", ":", ";", "<", "=", ">", "?", "@", "\\[" , "\\\\",
"\\]", "\\^", "_", "`", "\\{", "\\|", "\\}", "~", "\\.")
# single punctation marks
for (i in p[ !p %in% preserve ]) {
x = gsub(paste0("(\\s*", i, "\\s*)+"), "", x)
}
# add missing spaces for some reserved marks
for (i in space[ space %in% preserve ]) {
x = gsub(paste0("(\\s*", i, "\\s*)+"), paste0(i, " "), x)
}
# no space before other preserved characters
for(i in preserve[ !preserve %in% space ]) {
x = gsub(paste0("\\s*", i, "+\\s*"), i, x)
}
# no start/end punctuation
x = gsub("^(\\s|[[:punct:]])+|(\\s|[[:punct:]])+$", "", x)
x
}
#' Remove prefixes or suffixes
#' @param sep character separator, defaults to ","
#' @param side where to look, "left" (default) or "right"
#' @param greedy defaults to TRUE
#' @examples
#' ex = c("a", "a, b", "a, b, c", "a, b, c, d")
#' str_filter(ex, side = "left" , greedy = TRUE)
#' str_filter(ex, side = "right" , greedy = TRUE)
#' str_filter(ex, side = "left" , greedy = FALSE)
#' str_filter(ex, side = "right" , greedy = FALSE)
str_filter <- function(x, sep = ",", side = "left", greedy = TRUE) {
gsub(switch(side,
left = c(ifelse(greedy, "", "?"), ")", sep, "\\s*"),
right = c(ifelse(greedy, "?", ""), ")\\s*", sep)) %>%
c("(.*", ., "(.*)") %>%
paste0(collapse = ""),
switch(side, left = "\\2", right = "\\1"), x)
}
x = "A. B. Jon Example, C, D"
str_filter(x, sep = "\\.", side ="left", greedy = FALSE)
# [1] "B. Jon Example, C, D"
str_filter(x, sep = "\\.", side ="left", greedy = TRUE)
# [1] "Jon Example, C, D"
str_filter(x, sep = ",", side ="right", greedy = FALSE)
# [1] "A. B. Jon Example, C"
str_filter(x, sep = ",", side ="right", greedy = TRUE)
# [1] "A. B. Jon Example"
#' Detach prefixes or suffixes
#' @param ... arguments to \code{str_filter}
#' @examples
#' ex = c("a", "a, b", "a, b, c", "a, b, c, d")
#' str_detach(ex, side = "left" , greedy = TRUE)
#' str_detach(ex, side = "right" , greedy = TRUE)
#' str_detach(ex, side = "left" , greedy = FALSE)
#' str_detach(ex, side = "right" , greedy = FALSE)
str_detach <- function(x, sep = ",", side = "left", greedy = TRUE) {
y = str_filter(x, sep, side, greedy) %>% sapply(nchar)
x[ y > 0 ] = lapply(x[ y > 0 ],
function(x, regex = str_filter(x, sep, side, greedy)) {
y = c(
switch(side,
left = str_replace(x, regex, "") %>%
str_replace(str_c(sep, "\\s*$"), ""),
right = NULL),
regex,
switch(side,
left = NULL,
right = str_replace(x, regex, "") %>%
str_replace(str_c("^", sep, "\\s*"), ""))
)
ifelse(!nchar(y), NA, y)
})
sapply(x, function(x){ x[ switch(side, left = 1, right = 2 )]})
}
x = "A. B. Jon Example, C, D"
str_detach(x, sep = "\\.", side ="left", greedy = FALSE)
# [1] "A"
str_detach(x, sep = "\\.", side ="left", greedy = TRUE)
# [1] "A. B"
str_detach(x, sep = ",", side ="right", greedy = FALSE)
# [1] "D"
str_detach(x, sep = ",", side ="right", greedy = TRUE)
# [1] "C, D"
str_detach("Jon Example", side = "left")
# [1] NA
str_detach("Jon Example", side = "right")
# [1] NA
str_clean <- function(x) {
x = str_space(x) %>%
str_punct
data.frame(
prefix = str_filter(x, side = "right", sep = ",") %>%
str_detach(side = "left", sep = "\\.") %>%
str_punct(preserve = c(",", "-")),
name = str_filter(x, side = "right", sep = ",") %>%
str_filter(side = "left", sep = "\\."),
suffix = str_detach(x, sep = ",", side = "right") %>%
str_replace("\\.\\s", ", ") %>%
str_punct(preserve = c(",", "-")),
stringsAsFactors = FALSE
)
}
library(dplyr)
test = test[ -1:-3] %>%
cbind(original = ., str_clean(.)) %>%
# move prefix to name (row #73)
mutate(name = ifelse(name == "", prefix, name),
prefix = ifelse(prefix == name, NA, prefix)) %>%
# drop empty prefix column
select(-prefix)
#' Put first word last
str_invert <- function(x, size = 1) {
sapply(x, function(x) {
x = str_split(x, "\\s") %>% unlist
str_c(str_c(x[ -size ], collapse = " "), x[ size ], sep = " ")
})
}
subset(data.frame(from = test$name,
to = str_replace(test$name, "El\\s", "El_") %>%
str_invert %>%
str_replace("_", " ")),
str_count(to, "\\s") > 1)
test = select(-original) %>%
# invert names, protecting a prefix
mutate(test, name = str_replace(name, "El\\s", "El_") %>%
str_invert %>%
str_replace("_", " ")) %>%
# find and number duplicate names
group_by(name) %>%
mutate(n = n(), o = 1:n()) %>%
group_by() %>%
mutate(name = ifelse(n > 1, str_c(name, "-", o), name)) %>%
select(-n, -o)
# sanity check: no duplicates
rownames(test) = test$name
# or, alternately
stopifnot(!duplicated(test$name))
# test = c("Doris Bures", "Karlheinz Kopf", "Ing. Norbert Hofer", "\r\n\t\t\r\nAlm Nikolaus, Mag.\r\n",
# "\r\n\t\t\r\nAmon Werner, MBA\r\n", "\r\n\t\t\r\nAngerer Erwin\r\n",
# "\r\n\t\t\r\nAntoni Konrad\r\n", "\r\n\t\t\r\nAslan Aygül Berivan, Mag.\r\n",
# "\r\n\t\t\r\nAubauer Gertrude, Mag.\r\n", "\r\n\t\t\r\nAuer Jakob\r\n",
# "\r\n\t\t\r\nBacher Walter\r\n", "\r\n\t\t\r\nBayr Petra, MA\r\n",
# "\r\n\t\t\r\nBecher Ruth, Mag.\r\n", "\r\n\t\t\r\nBelakowitsch-Jenewein Dagmar, Dr.\r\n",
# "\r\n\t\t\r\nBerlakovich Nikolaus, Dipl.-Ing.\r\n", "\r\n\t\t\r\nBösch Reinhard Eugen, Dr.\r\n",
# "\r\n\t\t\r\nBrosz Dieter, MSc\r\n", "\r\n\t\t\r\nBrückl Hermann\r\n",
# "\r\n\t\t\r\nBrunner Christiane, Mag.\r\n", "\r\n\t\t\r\nBuchmayr Harry\r\n",
# "\r\n\t\t\r\nBures Doris\r\n", "\r\n\t\t\r\nCap Josef, Dr.\r\n",
# "\r\n\t\t\r\nDarmann Gernot, Mag.\r\n", "\r\n\t\t\r\nDeimek Gerhard, Dipl.-Ing.\r\n",
# "\r\n\t\t\r\nDiesner-Wais Martina\r\n", "\r\n\t\t\r\nDietrich Waltraud, Ing.\r\n",
# "\r\n\t\t\r\nDoppler Rupert\r\n", "\r\n\t\t\r\nDurchschlag Claudia\r\n",
# "\r\n\t\t\r\nEcker Cornelia\r\n", "\r\n\t\t\r\nEhmann Michael\r\n",
# "\r\n\t\t\r\nEl Habbassi Asdin, BA\r\n", "\r\n\t\t\r\nErtlschweiger Rouven, MSc\r\n",
# "\r\n\t\t\r\nEßl Franz Leonhard\r\n", "\r\n\t\t\r\nFazekas Hannes\r\n",
# "\r\n\t\t\r\nFeichtinger Klaus Uwe, Mag. Dr.\r\n", "\r\n\t\t\r\nFekter Maria Theresia, Mag. Dr.\r\n",
# "\r\n\t\t\r\nFichtinger Angela\r\n", "\r\n\t\t\r\nFranz Marcus, Dr.\r\n",
# "\r\n\t\t\r\nFuchs Hubert, MMag. DDr.\r\n", "\r\n\t\t\r\nGahr Hermann\r\n",
# "\r\n\t\t\r\nGamon Claudia Angela, MSc (WU)\r\n", "\r\n\t\t\r\nGerstl Wolfgang, Mag.\r\n",
# "\r\n\t\t\r\nGessl-Ranftl Andrea\r\n", "\r\n\t\t\r\nGlawischnig-Piesczek Eva, Dr.\r\n",
# "\r\n\t\t\r\nGreiner Karin, Mag.\r\n", "\r\n\t\t\r\nGrillitsch Fritz\r\n",
# "\r\n\t\t\r\nGroiß Werner, Ing. Mag.\r\n", "\r\n\t\t\r\nGrossmann Elisabeth, Mag.\r\n",
# "\r\n\t\t\r\nGusenbauer-Jäger Marianne\r\n", "\r\n\t\t\r\nHable Rainer, Dr.\r\n",
# "\r\n\t\t\r\nHackl Heinz-Peter, Ing.\r\n", "\r\n\t\t\r\nHafenecker Christian, MA\r\n",
# "\r\n\t\t\r\nHagen Christoph\r\n", "\r\n\t\t\r\nHaider Roman, Mag.\r\n",
# "\r\n\t\t\r\nHakel Elisabeth\r\n", "\r\n\t\t\r\nHammer Michael, Mag.\r\n",
# "\r\n\t\t\r\nHanger Andreas, Mag.\r\n", "\r\n\t\t\r\nHaubner Peter\r\n",
# "\r\n\t\t\r\nHauser Gerald, Mag.\r\n", "\r\n\t\t\r\nHechtl Johann\r\n",
# "\r\n\t\t\r\nHeinzl Anton\r\n", "\r\n\t\t\r\nHell Johann\r\n",
# "\r\n\t\t\r\nHimmelbauer Eva-Maria, BSc\r\n", "\r\n\t\t\r\nHöbart Christian, Ing.\r\n",
# "\r\n\t\t\r\nHofer Norbert, Ing.\r\n", "\r\n\t\t\r\nHofinger Manfred, Ing.\r\n",
# "\r\n\t\t\r\nHöfinger Johann\r\n", "\r\n\t\t\r\nHolzinger-Vogtenhuber Daniela, BA\r\n",
# "\r\n\t\t\r\nHuainigg Franz-Joseph, Dr.\r\n", "\r\n\t\t\r\nHübner Johannes, Dr.\r\n",
# "\r\n\t\t\r\nJank Brigitte\r\n", "\r\n\t\t\r\nJannach Harald\r\n",
# "\r\n\t\t\r\nJarmer Helene, Mag.\r\n", "\r\n\t\t\r\nJarolim Johannes, Dr.\r\n",
# "\r\n\t\t\r\nKarl Beatrix, Mag. Dr.\r\n", "\r\n\t\t\r\nKarlsböck Andreas F., Dr.\r\n",
# "\r\n\t\t\r\nKassegger Axel, MMMag. Dr.\r\n", "\r\n\t\t\r\nKatzian Wolfgang\r\n",
# "\r\n\t\t\r\nKeck Dietmar\r\n", "\r\n\t\t\r\nKickl Herbert\r\n",
# "\r\n\t\t\r\nKirchgatterer Franz\r\n", "\r\n\t\t\r\nKitzmüller Anneliese\r\n",
# "\r\n\t\t\r\nKnes Wolfgang\r\n", "\r\n\t\t\r\nKöchl Matthias\r\n",
# "\r\n\t\t\r\nKogler Werner, Mag.\r\n", "\r\n\t\t\r\nKönigsberger-Ludwig Ulrike\r\n",
# "\r\n\t\t\r\nKopf Karlheinz\r\n", "\r\n\t\t\r\nKorun Alev, Mag.\r\n",
# "\r\n\t\t\r\nKrainer Kai Jan\r\n", "\r\n\t\t\r\nKrist Hermann\r\n",
# "\r\n\t\t\r\nKucharowits Katharina\r\n", "\r\n\t\t\r\nKucher Philip\r\n",
# "\r\n\t\t\r\nKumpitsch Günther, Mag.\r\n", "\r\n\t\t\r\nKuntzl Andrea, Mag.\r\n",
# "\r\n\t\t\r\nLausch Christian\r\n", "\r\n\t\t\r\nLettenbichler Josef, Mag.\r\n",
# "\r\n\t\t\r\nLichtenecker Ruperta, Dr.\r\n", "\r\n\t\t\r\nLintl Jessi, Dr.\r\n",
# "\r\n\t\t\r\nLipitsch Hermann\r\n", "\r\n\t\t\r\nLoacker Gerald, Mag.\r\n",
# "\r\n\t\t\r\nLopatka Reinhold, Dr.\r\n", "\r\n\t\t\r\nLueger Angela\r\n",
# "\r\n\t\t\r\nLugar Robert, Ing.\r\n", "\r\n\t\t\r\nMatznetter Christoph, Dr.\r\n",
# "\r\n\t\t\r\nMaurer Sigrid\r\n", "\r\n\t\t\r\nMayer Elmar\r\n",
# "\r\n\t\t\r\nMölzer Wendelin\r\n", "\r\n\t\t\r\nMoser Gabriela, Dr.\r\n",
# "\r\n\t\t\r\nMuchitsch Josef\r\n", "\r\n\t\t\r\nMückstein Eva, Dr.\r\n",
# "\r\n\t\t\r\nMühlberghuber Edith\r\n", "\r\n\t\t\r\nMusiol Daniela, Mag.\r\n",
# "\r\n\t\t\r\nMuttonen Christine, Mag.\r\n", "\r\n\t\t\r\nNachbaur Kathrin, Dr.\r\n",
# "\r\n\t\t\r\nNeubauer Werner\r\n", "\r\n\t\t\r\nObernosterer Gabriel\r\n",
# "\r\n\t\t\r\nOfenauer Friedrich, Mag.\r\n", "\r\n\t\t\r\nOttenschläger Andreas\r\n",
# "\r\n\t\t\r\nPendl Otto\r\n", "\r\n\t\t\r\nPfurtscheller Elisabeth, Dipl.-Kffr. (FH)\r\n",
# "\r\n\t\t\r\nPilz Peter, Dr.\r\n", "\r\n\t\t\r\nPirklhuber Wolfgang, Dipl.-Ing. Dr.\r\n",
# "\r\n\t\t\r\nPlessl Rudolf\r\n", "\r\n\t\t\r\nPock Michael\r\n",
# "\r\n\t\t\r\nPreiner Erwin\r\n", "\r\n\t\t\r\nPrinz Nikolaus\r\n",
# "\r\n\t\t\r\nRädler Johann\r\n", "\r\n\t\t\r\nRasinger Erwin, Dr.\r\n",
# "\r\n\t\t\r\nRauch Johannes, Mag.\r\n", "\r\n\t\t\r\nRauch Walter\r\n",
# "\r\n\t\t\r\nRiemer Josef A.\r\n", "\r\n\t\t\r\nRosenkranz Walter, Dr.\r\n",
# "\r\n\t\t\r\nRosenkranz Barbara\r\n", "\r\n\t\t\r\nRossmann Bruno, Mag.\r\n",
# "\r\n\t\t\r\nSchabhüttl Jürgen\r\n", "\r\n\t\t\r\nSchatz Birgit, Mag.\r\n",
# "\r\n\t\t\r\nSchellenbacher Thomas, Ing.\r\n", "\r\n\t\t\r\nSchellhorn Josef\r\n",
# "\r\n\t\t\r\nSchenk Martina\r\n", "\r\n\t\t\r\nScherak Nikolaus, Dr.\r\n",
# "\r\n\t\t\r\nSchieder Andreas, Mag.\r\n", "\r\n\t\t\r\nSchimanek Carmen\r\n",
# "\r\n\t\t\r\nSchittenhelm Dorothea\r\n", "\r\n\t\t\r\nSchmid Julian, BA\r\n",
# "\r\n\t\t\r\nSchmid Gerhard\r\n", "\r\n\t\t\r\nSchmuckenschlager Johannes\r\n",
# "\r\n\t\t\r\nSchönegger Bernd, Mag.\r\n", "\r\n\t\t\r\nSchopf Walter\r\n",
# "\r\n\t\t\r\nSchrangl Philipp, Mag.\r\n", "\r\n\t\t\r\nSchultes Hermann, Ing.\r\n",
# "\r\n\t\t\r\nSchwentner Judith, Mag.\r\n", "\r\n\t\t\r\nSieber Norbert\r\n",
# "\r\n\t\t\r\nSinger Johann\r\n", "\r\n\t\t\r\nSpindelberger Erwin\r\n",
# "\r\n\t\t\r\nStefan Harald, Mag.\r\n", "\r\n\t\t\r\nSteger Petra\r\n",
# "\r\n\t\t\r\nSteinacker Michaela, Mag.\r\n", "\r\n\t\t\r\nSteinbichler Leopold\r\n",
# "\r\n\t\t\r\nSteinhauser Albert, Mag.\r\n", "\r\n\t\t\r\nStrache Heinz-Christian\r\n",
# "\r\n\t\t\r\nStrasser Georg, Dipl.-Ing.\r\n", "\r\n\t\t\r\nStrolz Matthias, Mag. Dr.\r\n",
# "\r\n\t\t\r\nTamandl Gabriele\r\n", "\r\n\t\t\r\nThemessl Bernhard\r\n",
# "\r\n\t\t\r\nTöchterle Karlheinz, Dr.\r\n", "\r\n\t\t\r\nTroch Harald, Dr.\r\n",
# "\r\n\t\t\r\nUnterrainer Maximilian, Mag.\r\n", "\r\n\t\t\r\nVavrik Christoph, Mag.\r\n",
# "\r\n\t\t\r\nVetter Georg, Dr.\r\n", "\r\n\t\t\r\nVogl Markus, Ing.\r\n",
# "\r\n\t\t\r\nWalser Harald, Dr.\r\n", "\r\n\t\t\r\nWeigerstorfer Ulrike\r\n",
# "\r\n\t\t\r\nWeninger Hannes\r\n", "\r\n\t\t\r\nWilli Georg\r\n",
# "\r\n\t\t\r\nWimmer Rainer\r\n", "\r\n\t\t\r\nWindbüchler-Souschill Tanja\r\n",
# "\r\n\t\t\r\nWinter Susanne, Dr.\r\n", "\r\n\t\t\r\nWinzig Angelika, Dr.\r\n",
# "\r\n\t\t\r\nWittmann Peter, Dr.\r\n", "\r\n\t\t\r\nWöginger August\r\n",
# "\r\n\t\t\r\nWurm Gisela, Mag.\r\n", "\r\n\t\t\r\nWurm Peter\r\n",
# "\r\n\t\t\r\nYilmaz Nurten\r\n", "\r\n\t\t\r\nZakostelsky Andreas, Mag.\r\n",
# "\r\n\t\t\r\nZanger Wolfgang\r\n", "\r\n\t\t\r\nZinggl Wolfgang, Mag. Dr.\r\n"
# )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment