Functions for cleaning the dataset for text mining
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Catching encoding errors: +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
catch.error = function(x){
  y = NA                                                     # Placeholder with a missing value, returned if an error occurs
  catch_error = tryCatch(tolower(x), error = function(e) e)  # Try tolower() and capture any encoding error
  if (!inherits(catch_error, "error"))                       # If no error was raised,
    y = tolower(x)                                           # apply tolower() normally; otherwise y stays NA
  return(y)
}
# Source: https://sites.google.com/site/miningtwitter/questions/talking-about/given-topic
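# Usage sketch (hypothetical strings): catch.error() lower-cases clean text and
# returns NA instead of stopping when a string's bytes are invalid in the session encoding.
catch.error("Mineração de TEXTO")   # "mineração de texto"
bad <- "caf\xe9"                    # latin1 bytes in a UTF-8 session
Encoding(bad) <- "UTF-8"            # mark the invalid bytes as UTF-8 to force the error
catch.error(bad)                    # NA, because tolower() fails on the invalid string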
#-----------------------------------------------------------------------------------------------
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Cleaning special characters +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
library(stringr) # needed for str_replace_all()
cleanTweets <- function(tweet){
  # Clean the tweet for sentiment analysis
  tweet = gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet) # Remove html links
  tweet = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)      # Remove retweet markers
  tweet = gsub("#\\w+", " ", tweet)                            # Remove every "#hashtag"
  tweet = gsub("@\\w+", " ", tweet)                            # Remove every "@people" mention
  tweet = gsub("[[:punct:]]", " ", tweet)                      # Remove all punctuation
  tweet = gsub("[[:digit:]]", " ", tweet)                      # Remove numbers; only text is needed for the analysis
  tweet = gsub("[ \t]{2,}", " ", tweet)                        # Collapse runs of whitespace (spaces, tabs, etc.)
  tweet = gsub("^\\s+|\\s+$", "", tweet)                       # Trim leading and trailing whitespace
  tweet = gsub("[[:cntrl:]]", "", tweet)                       # Remove control characters
  tweet = str_replace_all(tweet, "[^[:graph:]]", " ")          # Replace non-printable strings such as emoticons
  # tweet = SnowballC::wordStem(tweet, language = lang)        # Apply stemming (disabled)
  tweet = catch.error(tweet)                                   # Convert everything to lower case via catch.error()
  return(tweet)
}
# Reference: https://sites.google.com/site/miningtwitter/questions/talking-about/wordclouds/comparison-cloud
#-----------------------------------------------------------------------------------------------
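# Usage sketch with a made-up tweet:
cleanTweets("RT @user: veja http://t.co/abc #promo 50% off!!")
# returns "veja" (link, mention, hashtag, numbers and punctuation are all stripped)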
# Remove NAs
cleanTweetsAndRemoveNAs <- function(Tweets) {
  TweetsCleaned = sapply(Tweets, cleanTweets)
  # Remove the "NA" tweets from this tweet list
  TweetsCleaned = TweetsCleaned[!is.na(TweetsCleaned)]
  names(TweetsCleaned) = NULL
  # Remove the duplicate tweets from this tweet list
  TweetsCleaned = unique(TweetsCleaned)
  TweetsCleaned
}
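# Usage sketch with made-up tweets: cleaning, NA removal and deduplication in one call.
tweets <- c("Adorei o produto!!! :)", "Adorei o produto!!! :)", "Péssimo atendimento @loja")
cleanTweetsAndRemoveNAs(tweets)
# e.g. c("adorei o produto", "péssimo atendimento")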
#++++++++++++++++++++++++++++++++++
# rm_accent() initial version taken from:
# - https://pt.stackoverflow.com/questions/46473/remover-acentos
#+++++++++++++++++++++++++++++++++++
# Remove accents
rm_accent <- function(str, pattern = "all") {
  # Useful routines and functions V 1.0
  # rm_accent - REMOVES ACCENTS FROM WORDS
  # Function that strips all accents and diacritics from a vector of strings.
  # Parameters:
  # str - vector of strings whose accents will be removed.
  # pattern - vector of strings with one or more elements indicating which accents should be removed.
  #           To indicate which accents to remove, pass a vector with the corresponding symbols.
  #           Example: pattern = c("´", "^") removes only acute and circumflex accents.
  #           Also accepted: "all" (removes every accent, i.e. "´", "`", "^", "~", "¨", "ç")
  if(!is.character(str))
    str <- as.character(str)
  pattern <- unique(pattern)
  if(any(pattern == "Ç"))
    pattern[pattern == "Ç"] <- "ç"
  symbols <- c(
    acute = "áéíóúÁÉÍÓÚýÝ",
    grave = "àèìòùÀÈÌÒÙ",
    circunflex = "âêîôûÂÊÎÔÛ",
    tilde = "ãõÃÕñÑ",
    umlaut = "äëïöüÄËÏÖÜÿ",
    cedil = "çÇ"
  )
  nudeSymbols <- c(
    acute = "aeiouAEIOUyY",
    grave = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde = "aoAOnN",
    umlaut = "aeiouAEIOUy",
    cedil = "cC"
  )
  accentTypes <- c("´", "`", "^", "~", "¨", "ç")
  if(any(c("all", "al", "a", "todos", "t", "to", "tod", "todo") %in% pattern)) # option: remove all accents
    return(chartr(paste(symbols, collapse = ""), paste(nudeSymbols, collapse = ""), str))
  for(i in which(accentTypes %in% pattern))
    str <- chartr(symbols[i], nudeSymbols[i], str)
  return(str)
}
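# Usage sketch:
rm_accent("Mineração de texto é útil")   # "Mineracao de texto e util"
rm_accent("Mineração", pattern = "~")    # only tildes are removed: "Mineraçao"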
#++++++++++++++++++++++++++++++++++
# Sentiment analysis function that
# sums the score of each sentence
# according to the polarity given in
# the lexiconPT::sentiLex_lem_PT02 lexicon
#+++++++++++++++++++++++++++++++++++
library(lexiconPT)
library(dplyr)
pos.words = sentiLex_lem_PT02 %>%
  filter(polarity > 0) %>%
  pull(term)               # keep only the terms: match() below expects a character vector
neg.words = sentiLex_lem_PT02 %>%
  filter(polarity < 0) %>%
  pull(term)
score.sentiment = function(tweets)
{
  require(plyr)
  require(stringr)
  scores = laply(tweets, function(tweet, pos.words, neg.words) {
    word.list = str_split(tweet, '\\s+')        # split the tweet into words
    words = unlist(word.list)                   # turn the list into a vector
    pos.matches = match(words, pos.words)       # returns matching values for words from the list
    neg.matches = match(words, neg.words)
    pos.matches = !is.na(pos.matches)           # convert matching values to TRUE or FALSE
    neg.matches = !is.na(neg.matches)
    score = sum(pos.matches) - sum(neg.matches) # TRUE and FALSE are treated as 1 and 0, so they can be summed
    return(score)
  }, pos.words, neg.words)
  scores.df = data.frame(score = scores, text = tweets)
  return(scores.df)
}
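# Usage sketch with made-up sentences. Words absent from the lexicon simply contribute 0.
frases <- c("bom produto", "produto ruim")
score.sentiment(frases)
# expected: a positive score for the first sentence and a negative one for the
# second, assuming "bom" and "ruim" are listed in sentiLex_lem_PT02 with those polarities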
library(rJava)
library(RWeka)
library(tm) # provides TermDocumentMatrix()
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ n-grams with RWeka: +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# `ngrams` (the n-gram size) and `myCorpus` (a tm corpus) must be defined beforehand; see the sketch below.
Tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
myDTM = TermDocumentMatrix(myCorpus, control = list(tokenize = Tokenizer))
# -----------------------------------------------------------------------------------------------
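# End-to-end sketch, assuming `TweetsCleaned` was produced by cleanTweetsAndRemoveNAs():
ngrams <- 2                                       # bigrams
myCorpus <- VCorpus(VectorSource(TweetsCleaned))  # build a tm corpus from the cleaned tweets
myDTM <- TermDocumentMatrix(myCorpus, control = list(tokenize = Tokenizer))
findFreqTerms(myDTM, lowfreq = 5)                 # bigrams appearing at least 5 times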