Functions for cleaning the dataset for text mining
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Catching encoding errors: +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
catch.error = function(x){
  y = NA                                                     # Placeholder with a missing value, returned if an error occurs
  catch_error = tryCatch(tolower(x), error = function(e) e)  # Try tolower() and capture any encoding error
  if (!inherits(catch_error, "error"))                       # If no error was raised,
    y = tolower(x)                                           # apply tolower() normally; otherwise y stays NA
  return(y)
}
# Source: https://sites.google.com/site/miningtwitter/questions/talking-about/given-topic
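# Usage sketch (hypothetical strings): catch.error() lower-cases clean text and
# returns NA instead of stopping when a string's bytes are invalid in the session encoding.
catch.error("Mineração de TEXTO")   # "mineração de texto"
bad <- "caf\xe9"                    # latin1 bytes in a UTF-8 session
Encoding(bad) <- "UTF-8"            # mark the invalid bytes as UTF-8 to force the error
catch.error(bad)                    # NA, because tolower() fails on the invalid string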
#-----------------------------------------------------------------------------------------------
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ Cleaning special characters +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
library(stringr) # needed for str_replace_all()
cleanTweets <- function(tweet){
  # Clean the tweet for sentiment analysis
  tweet = gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet) # Remove html links
  tweet = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)      # Remove retweet markers
  tweet = gsub("#\\w+", " ", tweet)                            # Remove every "#hashtag"
  tweet = gsub("@\\w+", " ", tweet)                            # Remove every "@people" mention
  tweet = gsub("[[:punct:]]", " ", tweet)                      # Remove all punctuation
  tweet = gsub("[[:digit:]]", " ", tweet)                      # Remove numbers; only text is needed for the analysis
  tweet = gsub("[ \t]{2,}", " ", tweet)                        # Collapse runs of whitespace (spaces, tabs, etc.)
  tweet = gsub("^\\s+|\\s+$", "", tweet)                       # Trim leading and trailing whitespace
  tweet = gsub("[[:cntrl:]]", "", tweet)                       # Remove control characters
  tweet = str_replace_all(tweet, "[^[:graph:]]", " ")          # Replace non-printable strings such as emoticons
  # tweet = SnowballC::wordStem(tweet, language = lang)        # Apply stemming (disabled)
  tweet = catch.error(tweet)                                   # Convert everything to lower case via catch.error()
  return(tweet)
}
# Reference: https://sites.google.com/site/miningtwitter/questions/talking-about/wordclouds/comparison-cloud
#-----------------------------------------------------------------------------------------------
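# Usage sketch with a made-up tweet:
cleanTweets("RT @user: veja http://t.co/abc #promo 50% off!!")
# returns "veja" (link, mention, hashtag, numbers and punctuation are all stripped)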
# Remove NAs
cleanTweetsAndRemoveNAs <- function(Tweets) {
  TweetsCleaned = sapply(Tweets, cleanTweets)
  # Remove the "NA" tweets from this tweet list
  TweetsCleaned = TweetsCleaned[!is.na(TweetsCleaned)]
  names(TweetsCleaned) = NULL
  # Remove the duplicate tweets from this tweet list
  TweetsCleaned = unique(TweetsCleaned)
  TweetsCleaned
}
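# Usage sketch with made-up tweets: cleaning, NA removal and deduplication in one call.
tweets <- c("Adorei o produto!!! :)", "Adorei o produto!!! :)", "Péssimo atendimento @loja")
cleanTweetsAndRemoveNAs(tweets)
# e.g. c("adorei o produto", "péssimo atendimento")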
#++++++++++++++++++++++++++++++++++
# rm_accent() initial version taken from:
# - https://pt.stackoverflow.com/questions/46473/remover-acentos
#+++++++++++++++++++++++++++++++++++
# Remove accents
rm_accent <- function(str, pattern = "all") {
  # Useful routines and functions V 1.0
  # rm_accent - REMOVES ACCENTS FROM WORDS
  # Function that strips all accents and diacritics from a vector of strings.
  # Parameters:
  # str - vector of strings whose accents will be removed.
  # pattern - vector of strings with one or more elements indicating which accents should be removed.
  #           To indicate which accents to remove, pass a vector with the corresponding symbols.
  #           Example: pattern = c("´", "^") removes only acute and circumflex accents.
  #           Also accepted: "all" (removes every accent, i.e. "´", "`", "^", "~", "¨", "ç")
  if(!is.character(str))
    str <- as.character(str)
  pattern <- unique(pattern)
  if(any(pattern == "Ç"))
    pattern[pattern == "Ç"] <- "ç"
  symbols <- c(
    acute = "áéíóúÁÉÍÓÚýÝ",
    grave = "àèìòùÀÈÌÒÙ",
    circunflex = "âêîôûÂÊÎÔÛ",
    tilde = "ãõÃÕñÑ",
    umlaut = "äëïöüÄËÏÖÜÿ",
    cedil = "çÇ"
  )
  nudeSymbols <- c(
    acute = "aeiouAEIOUyY",
    grave = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde = "aoAOnN",
    umlaut = "aeiouAEIOUy",
    cedil = "cC"
  )
  accentTypes <- c("´", "`", "^", "~", "¨", "ç")
  if(any(c("all", "al", "a", "todos", "t", "to", "tod", "todo") %in% pattern)) # option: remove all accents
    return(chartr(paste(symbols, collapse = ""), paste(nudeSymbols, collapse = ""), str))
  for(i in which(accentTypes %in% pattern))
    str <- chartr(symbols[i], nudeSymbols[i], str)
  return(str)
}
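# Usage sketch:
rm_accent("Mineração de texto é útil")   # "Mineracao de texto e util"
rm_accent("Mineração", pattern = "~")    # only tildes are removed: "Mineraçao"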
#++++++++++++++++++++++++++++++++++
# Sentiment analysis function that
# sums the score of each sentence
# according to the polarity given in
# the lexiconPT::sentiLex_lem_PT02 lexicon
#+++++++++++++++++++++++++++++++++++
library(lexiconPT)
library(dplyr)
pos.words = sentiLex_lem_PT02 %>%
  filter(polarity > 0) %>%
  pull(term)               # keep only the terms: match() below expects a character vector
neg.words = sentiLex_lem_PT02 %>%
  filter(polarity < 0) %>%
  pull(term)
score.sentiment = function(tweets)
{
  require(plyr)
  require(stringr)
  scores = laply(tweets, function(tweet, pos.words, neg.words) {
    word.list = str_split(tweet, '\\s+')        # split the tweet into words
    words = unlist(word.list)                   # turn the list into a vector
    pos.matches = match(words, pos.words)       # returns matching values for words from the list
    neg.matches = match(words, neg.words)
    pos.matches = !is.na(pos.matches)           # convert matching values to TRUE or FALSE
    neg.matches = !is.na(neg.matches)
    score = sum(pos.matches) - sum(neg.matches) # TRUE and FALSE are treated as 1 and 0, so they can be summed
    return(score)
  }, pos.words, neg.words)
  scores.df = data.frame(score = scores, text = tweets)
  return(scores.df)
}
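# Usage sketch with made-up sentences. Words absent from the lexicon simply contribute 0.
frases <- c("bom produto", "produto ruim")
score.sentiment(frases)
# expected: a positive score for the first sentence and a negative one for the
# second, assuming "bom" and "ruim" are listed in sentiLex_lem_PT02 with those polarities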
library(rJava)
library(RWeka)
library(tm) # provides TermDocumentMatrix()
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+ n-grams with RWeka: +
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# `ngrams` (the n-gram size) and `myCorpus` (a tm corpus) must be defined beforehand; see the sketch below.
Tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
myDTM = TermDocumentMatrix(myCorpus, control = list(tokenize = Tokenizer))
# -----------------------------------------------------------------------------------------------
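# End-to-end sketch, assuming `TweetsCleaned` was produced by cleanTweetsAndRemoveNAs():
ngrams <- 2                                       # bigrams
myCorpus <- VCorpus(VectorSource(TweetsCleaned))  # build a tm corpus from the cleaned tweets
myDTM <- TermDocumentMatrix(myCorpus, control = list(tokenize = Tokenizer))
findFreqTerms(myDTM, lowfreq = 5)                 # bigrams appearing at least 5 times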