Last active
November 1, 2019 15:16
-
-
Save romain9292/58c438d3a441b36bbf5161e1317a1ca4 to your computer and use it in GitHub Desktop.
[Nettoyer un texte avec R] Supprimer les sauts de ligne, balises HTML, espaces et plus #R #text #clean #datacleansing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalise free text for downstream text analysis.
#
# Steps, in order: newlines become spaces, HTML tags and URLs are dropped,
# backslashes are deleted, the text is lower-cased, punctuation is replaced
# by spaces, digits are removed, and finally whitespace is collapsed and
# trimmed.
#
# text: character vector (other atomic input is coerced by gsub()).
# Returns a character vector of the same length as `text`, returned visibly
# so the result prints when the function is called at the console.
clean_text <- function(text) {
  # Replace line breaks with a space so words on adjacent lines stay apart
  # and tags spanning several lines sit on one line before tag removal.
  text <- gsub("\n", " ", text)
  # Remove HTML tags BEFORE URLs: otherwise the URL pattern swallows the end
  # of a tag such as <a href="http://...">, leaving tag debris behind.
  text <- gsub("<.*?>", "", text)
  # Remove URLs (anything starting with "http" up to the next whitespace).
  text <- gsub("http\\S+\\s*", "", text)
  # Remove backslashes (deleted outright, not replaced by a space).
  text <- gsub("[\\]", "", text)
  # Harmonise case.
  text <- tolower(text)
  # Replace punctuation with a space so neighbouring words are not glued.
  text <- gsub("[[:punct:]]", " ", text)
  # Remove numeric values.
  text <- gsub("\\d+", "", text)
  # Collapse whitespace last: the punctuation and digit steps above
  # introduce new runs of spaces that an earlier collapse would miss.
  text <- gsub("\\s+", " ", text)
  # Drop leading and trailing whitespace.
  trimws(text)
}
# Count the words in each element of a character vector.
#
# A word is a maximal run of word characters (regex \w, PCRE semantics).
# str: character vector.
# Returns a numeric vector of the same length as `str`: 0 for empty or
# whitespace-only strings, NA for NA input, numeric(0) for empty input.
wordcount <- function(str) {
  word_hits <- gregexpr("\\w+", str, perl = TRUE)
  # gregexpr() reports -1 when nothing matches, so counting the positive
  # start positions yields 0 for strings that contain no word at all.
  vapply(word_hits, function(pos) as.numeric(sum(pos > 0)), numeric(1))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment