Skip to content

Instantly share code, notes, and snippets.

@sillasgonzaga
Created March 14, 2017 22:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sillasgonzaga/ae893e8f7884d688fd8320a78d5303b2 to your computer and use it in GitHub Desktop.
Save sillasgonzaga/ae893e8f7884d688fd8320a78d5303b2 to your computer and use it in GitHub Desktop.
# projeto analise de sentimento
library(tidyr)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
library(ggplot2)
library(tm)
library(SnowballC)
library(wordcloud)
# Load the exported chat: semicolon-separated, no header row, read with the
# ISO-8859-1 encoding; empty fields become NA and strings stay character.
chat <- read.csv(
  file = "chat.csv",
  header = FALSE,
  sep = ";",
  na.strings = "",
  stringsAsFactors = FALSE,
  fileEncoding = "ISO-8859-1"
)
# Rows whose first field starts with a non-digit character
# (presumably lines that continue the previous message -- TODO confirm).
linhas_sem_digito <- grep("^\\D", chat[, 1])
# Show how many such rows were found.
length(linhas_sem_digito)
# Append 20 all-NA columns so that rows can be shifted to the right.
chat <- cbind(chat, matrix(nrow = nrow(chat), ncol = 20))
# For every row that does not start with a digit, move its text five columns
# to the right so that columns 1-5 are left empty for that row.
for (linha in linhas_sem_digito) {
  # Number of leading fields holding text (everything before the first NA).
  n_palavras <- which(is.na(chat[linha, ]))[1] - 1
  origem <- seq_len(n_palavras)
  palavras <- chat[linha, origem]
  # Clear the source cells BEFORE writing the shifted copy: source and target
  # ranges overlap when the row has more than five fields, and clearing
  # afterwards would erase words that had just been moved.
  chat[linha, origem] <- NA
  # The target range has exactly as many columns as the source; a wider target
  # would make the data-frame assignment recycle the first word into the
  # extra column.
  chat[linha, 5 + origem] <- palavras
}
# Drop rows in which every field is NA. A logical mask is used rather than
# `-which(...)`: when no row is empty, `which()` returns integer(0) and the
# negated index would select zero rows, discarding the whole data frame.
linhas_vazias <- apply(chat, 1, function(x) all(is.na(x)))
chat <- chat[!linhas_vazias, ]
# Forward-fill columns 1-5: each row whose first column is NA takes the values
# of the row right above it. Rows are visited top to bottom, so a run of
# consecutive NA rows inherits from the last filled row.
for (linha in which(is.na(chat[[1]]))) {
  anterior <- linha - 1
  chat[linha, 1:5] <- chat[anterior, 1:5]
}
# Drop columns 3 and 4. Column 3 contains only "-".
# NOTE(review): column 4 is dropped as well; its content is not visible from
# this script -- confirm it carries nothing that is needed.
chat <- chat[, -c(3, 4)]
# Merge columns 1 and 2 (date and time) into a single string column,
# then remove the now redundant second column.
chat[, 1] <- paste(chat[, 1], chat[, 2])
chat <- chat[, -2]
# Name the first three columns.
colnames(chat)[1:3] <- c("time", "pessoa", "V3")
# Parse the timestamp text (the comma after the date is part of the format)
# and store it as POSIXct.
chat$time <- as.POSIXct(strptime(chat$time, "%d/%m/%y, %H:%M"))
# Remove the trailing colon from the person's name.
chat$pessoa <- str_replace(chat$pessoa, ":$", "")
# Rename every text column to V3, V4, ... One name is generated per column
# from position 3 to the last; generating fewer names would make the
# assignment recycle them (with a warning) and produce duplicate column names.
cols_msgs <- paste0("V", 3:ncol(chat))
names(chat)[3:ncol(chat)] <- cols_msgs
# Collapse the text columns (3 onwards) of each row into one message string.
# Only the text columns are searched for the first NA, so a missing value in
# `time` or `pessoa` cannot shorten or reverse the selected range, and a row
# without any text yields "" instead of a descending range such as 3:2.
texto <- as.matrix(chat[, 3:ncol(chat), drop = FALSE])
msg <- vector("character", length = nrow(chat))
for (i in seq_along(msg)) {
  palavras <- texto[i, ]
  primeiro_na <- which(is.na(palavras))[1]
  if (!is.na(primeiro_na)) {
    # Keep only the fields that come before the first empty one.
    palavras <- palavras[seq_len(primeiro_na - 1)]
  }
  msg[i] <- paste(palavras, collapse = " ")
}
# Keep only time and person, then attach the lower-cased message text and a
# sequential message id placed as the first column.
chat <- chat[, c(1, 2)]
chat$msg <- str_to_lower(msg)
chat$id_msg <- seq_len(nrow(chat))
chat %<>% select(id_msg, everything())
# Discard messages that contain a link, as well as the placeholder text left
# in place of omitted media.
chat <- chat %>%
  filter(
    !grepl("http", msg),
    msg != "<mídia omitida>"
  )
## Analysis: who typed "amor" the most?
# Flag (1/0) messages containing the whole word "amor" or "amorzinho".
chat$amor <- as.numeric(
  grepl("\\bamor\\b", chat$msg) | grepl("\\bamorzinho\\b", chat$msg)
)
# Calendar day of each message. The day is taken from the formatted timestamp,
# i.e. in the same time zone the timestamps are displayed in. as.Date() on a
# POSIXct value documents tz = "UTC" as its default, which can push
# late-evening messages onto the following day.
chat %<>% mutate(dia = as.Date(format(time, "%Y-%m-%d")))
# Number of messages per day and person, drawn as lines with a dashed
# smoothed trend for each person.
chat %>%
  count(dia, pessoa) %>%
  ggplot(aes(x = dia, y = n, color = pessoa)) +
  geom_line() +
  geom_smooth(se = FALSE, size = 1, linetype = "dashed") +
  labs(
    x = NULL,
    y = "Quantidade de mensagens",
    # Suppress the legend title. `labs()` takes aesthetic names, so the
    # colour legend is addressed as `color`; an argument called `labs`
    # matches no aesthetic and has no effect.
    color = NULL,
    title = "Quantidade de mensagens por pessoa por dia"
  ) +
  theme(legend.position = "bottom")
# Usage of the word "amor": per day and person, the sum of the `amor` flag.
ggplot(
  data = summarise(group_by(chat, dia, pessoa), amor = sum(amor)),
  mapping = aes(x = dia, y = amor, color = pessoa)
) +
  geom_line() +
  theme(legend.position = "bottom") +
  labs(
    title = "Quantidade de mensagens por pessoa com a palavra amor",
    x = "",
    y = "Quantidade de mensagens",
    color = "Pessoa"
  )
# Messages per weekday: one bar per person, placed side by side.
ggplot(
  data = count(mutate(chat, wday = wday(dia, label = TRUE)), wday, pessoa),
  mapping = aes(x = wday, y = n, fill = pessoa)
) +
  # geom_col() draws bars whose heights are the values in the data.
  geom_col(position = "dodge") +
  theme(legend.position = "bottom") +
  labs(
    title = "Quantidade de mensagens por dia da semana",
    x = "Dia da semana",
    y = "Quantidade de mensagens"
  )
# Message length: store the number of characters of each message and show its
# distribution per person as a boxplot.
chat[["tamanho_msg"]] <- nchar(chat[["msg"]])
ggplot(chat, aes(x = pessoa, y = tamanho_msg, fill = pessoa)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(
    title = "Distribuição do tamanho das mensagens em caracteres",
    x = NULL,
    y = "Tamanho das mensagens"
  )
# Word cloud
# Messages of each of the two people in the chat.
eu <- with(chat, msg[pessoa == "Eu"])
ela <- with(chat, msg[pessoa == "Ela"])
# Custom stop words followed by the Portuguese stop-word list from tm.
mystopwords <- c("pra", "nao", "omitida", stopwords("portuguese"))
# Build a corpus from the "Eu" messages, strip the stop words and draw the
# 50 most frequent words, most frequent first.
wordcloud(
  tm_map(Corpus(VectorSource(eu)), removeWords, mystopwords),
  max.words = 50,
  random.order = FALSE,
  scale = c(3, .6)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment