Last active
February 20, 2018 04:06
-
-
Save gomesfellipe/141f5b1bb6f4f023baffeb0abeca9cca to your computer and use it in GitHub Desktop.
Funções para limpeza da base de dados para Mineração de texto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Trapping encoding errors                                                                     +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Lower-case `x`, returning NA instead of failing.
#
# Arguments:
#   x - object to lower-case (normally a character vector).
# Returns:
#   tolower(x) when that call succeeds; a single NA when tolower() signals an
#   error, so one bad input does not abort a whole cleaning run.
catch.error <- function(x) {
  # tolower() is evaluated exactly once; any error is turned into NA.
  tryCatch(tolower(x), error = function(e) NA)
}
# Source: https://sites.google.com/site/miningtwitter/questions/talking-about/given-topic
#-----------------------------------------------------------------------------------------------
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Special-character cleanup                                                                    +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Clean raw tweet text for sentiment analysis.
#
# Arguments:
#   tweet - character vector of tweets (the substitutions are vectorised).
# Returns:
#   The lower-cased text with links, retweet markers, hashtags, mentions,
#   punctuation, digits and non-printable characters removed, and whitespace
#   collapsed to single spaces with no leading/trailing blanks. The result is
#   whatever catch.error() returns, i.e. NA if lower-casing fails.
cleanTweets <- function(tweet) {
  # Remove links. Only the URL token itself (up to the next whitespace) is
  # dropped; a greedy ".*" here would also delete the rest of the tweet.
  tweet <- gsub("(f|ht)tps?://[^[:space:]]+", " ", tweet)
  # Remove retweet markers such as "RT @user" / "via @user".
  tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
  # Remove every "#hashtag".
  tweet <- gsub("#\\w+", " ", tweet)
  # Remove every "@mention".
  tweet <- gsub("@\\w+", " ", tweet)
  # Replace punctuation and digits with a space: only words are needed.
  tweet <- gsub("[[:punct:]]", " ", tweet)
  tweet <- gsub("[[:digit:]]", " ", tweet)
  # Replace anything that is not a printable glyph (control characters,
  # tabs, newlines and similar) with a space.
  tweet <- gsub("[^[:graph:]]", " ", tweet)
  # Normalise whitespace last, so the substitutions above cannot leave
  # runs of blanks or leading/trailing spaces behind.
  tweet <- gsub("\\s+", " ", tweet)
  tweet <- gsub("^\\s+|\\s+$", "", tweet)
  # Stemming is intentionally disabled:
  # tweet <- SnowballC::wordStem(tweet, language = lang)
  # Convert to lower case, tolerating encoding errors.
  catch.error(tweet)
}
# Reference: https://sites.google.com/site/miningtwitter/questions/talking-about/wordclouds/comparison-cloud
#-----------------------------------------------------------------------------------------------
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean a collection of tweets, then drop failures and duplicates.
#
# Arguments:
#   Tweets - vector (or list) of raw tweets; each element is passed to
#            cleanTweets() on its own, so a failure in one tweet only
#            affects that tweet.
# Returns:
#   An unnamed character vector of the distinct cleaned tweets, without NA
#   entries. Empty input yields character(0).
cleanTweetsAndRemoveNAs <- function(Tweets) {
  # lapply + unlist keeps the result a plain character vector even for empty
  # input, where sapply() would return an empty list instead.
  cleaned <- as.character(unlist(lapply(Tweets, cleanTweets), use.names = FALSE))
  # Remove the tweets that could not be cleaned (NA).
  cleaned <- cleaned[!is.na(cleaned)]
  # Remove repeated tweets.
  unique(cleaned)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#++++++++++++++++++++++++++++++++++
# rm_accent(): initial version taken from
#  - https://pt.stackoverflow.com/questions/46473/remover-acentos
#+++++++++++++++++++++++++++++++++++
# Remove accents from a vector of strings.
#
# Arguments:
#   str     - vector of strings whose accents are removed; non-character
#             input is coerced with as.character().
#   pattern - character vector naming the accent families to remove: acute
#             (U+00B4), grave ("`"), circumflex ("^"), tilde ("~"), umlaut
#             (U+00A8) and/or c-cedilla (U+00E7, upper case also accepted).
#             "all" (the default) or one of its aliases "al", "a", "todos",
#             "t", "to", "tod", "todo" removes every family.
# Returns:
#   `str` with each selected accented letter replaced by its unaccented
#   counterpart. Unrecognised `pattern` values select nothing.
#
# The accented characters are written as \u escapes so the function does not
# depend on the encoding this file is read with.
rm_accent <- function(str, pattern = "all") {
  if (!is.character(str)) {
    str <- as.character(str)
  }

  pattern <- unique(pattern)
  # Treat an upper-case cedilla selector as the lower-case one.
  if (any(pattern == "\u00c7")) {
    pattern[pattern == "\u00c7"] <- "\u00e7"
  }

  # Accented letters per family; position k maps to position k of the
  # same-named entry in `nudeSymbols`.
  symbols <- c(
    # a e i o u A E I O U y Y with acute accent
    acute = "\u00e1\u00e9\u00ed\u00f3\u00fa\u00c1\u00c9\u00cd\u00d3\u00da\u00fd\u00dd",
    # a e i o u A E I O U with grave accent
    grave = "\u00e0\u00e8\u00ec\u00f2\u00f9\u00c0\u00c8\u00cc\u00d2\u00d9",
    # a e i o u A E I O U with circumflex
    circunflex = "\u00e2\u00ea\u00ee\u00f4\u00fb\u00c2\u00ca\u00ce\u00d4\u00db",
    # a o A O n N with tilde
    tilde = "\u00e3\u00f5\u00c3\u00d5\u00f1\u00d1",
    # a e i o u A E I O U y with diaeresis
    umlaut = "\u00e4\u00eb\u00ef\u00f6\u00fc\u00c4\u00cb\u00cf\u00d6\u00dc\u00ff",
    # c C with cedilla
    cedil = "\u00e7\u00c7"
  )
  nudeSymbols <- c(
    acute = "aeiouAEIOUyY",
    grave = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde = "aoAOnN",
    umlaut = "aeiouAEIOUy",
    cedil = "cC"
  )
  # Selector accepted in `pattern` for each family, in the same order as
  # `symbols` / `nudeSymbols`.
  accentTypes <- c("\u00b4", "`", "^", "~", "\u00a8", "\u00e7")

  # "Remove everything" option: translate all families in a single pass.
  allAliases <- c("all", "al", "a", "todos", "t", "to", "tod", "todo")
  if (any(allAliases %in% pattern)) {
    return(chartr(paste(symbols, collapse = ""),
                  paste(nudeSymbols, collapse = ""),
                  str))
  }

  # Otherwise translate only the requested families, one after another.
  for (i in which(accentTypes %in% pattern)) {
    str <- chartr(symbols[i], nudeSymbols[i], str)
  }
  str
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#++++++++++++++++++++++++++++++++++
# Sentiment analysis: score.sentiment() sums a score for each sentence
# according to the polarity given in lexiconPT::sentiLex_lem_PT02.
#+++++++++++++++++++++++++++++++++++
library(lexiconPT)
library(dplyr)

# Lexicon rows with positive / negative polarity. filter() keeps whole rows,
# so both objects are data frames rather than plain word vectors.
# NOTE(review): the words themselves are presumably in the `term` column of
# sentiLex_lem_PT02 - confirm against the lexiconPT documentation.
pos.words <- sentiLex_lem_PT02 %>%
  filter(polarity > 0)
neg.words <- sentiLex_lem_PT02 %>%
  filter(polarity < 0)
# Score each tweet as (number of positive words) - (number of negative words).
#
# Arguments:
#   tweets - vector of (already cleaned) tweet texts.
# Returns:
#   A data frame with one row per tweet and the columns `score` (integer) and
#   `text` (the input tweets).
#
# The word lists are read from the global objects `pos.words` and `neg.words`.
# Each may be a character vector of words or a lexicon data frame, in which
# case its `term` column supplies the words. Matching against the data frame
# itself never finds a word, which would make every score 0.
#
# Only base R is used: attaching plyr from inside this function would mask
# dplyr verbs for the rest of the session.
score.sentiment <- function(tweets) {
  # Reduce a lexicon (data frame or vector) to a plain character vector.
  lexicon_terms <- function(lexicon) {
    if (is.data.frame(lexicon)) {
      as.character(lexicon[["term"]])
    } else {
      as.character(lexicon)
    }
  }
  positive <- lexicon_terms(pos.words)
  negative <- lexicon_terms(neg.words)

  scores <- vapply(
    as.character(tweets),
    function(tweet) {
      # Split the tweet into words on runs of whitespace.
      words <- unlist(strsplit(tweet, "\\s+"))
      # TRUE/FALSE count as 1/0, so sum() counts the matching words.
      sum(words %in% positive) - sum(words %in% negative)
    },
    integer(1),
    USE.NAMES = FALSE
  )

  data.frame(score = scores, text = tweets)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rJava)
library(RWeka)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ n-grams with RWeka                                                                           +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Tokenizer passing the same value as both `min` and `max` to Weka_control(),
# so NGramTokenizer() is asked for n-grams of exactly `ngrams` words.
# `ngrams` is not defined in this snippet; it must exist in the calling
# environment before the tokenizer is used.
Tokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
}

# Term-document matrix built with the n-gram tokenizer above.
# `myCorpus` is likewise not defined here and must be created beforehand.
# NOTE(review): TermDocumentMatrix() is not provided by a package attached in
# this snippet (presumably tm) - confirm it is loaded elsewhere.
myDTM <- TermDocumentMatrix(myCorpus, control = list(tokenize = Tokenizer))
# -----------------------------------------------------------------------------------------------
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment