# Source: GitHub Gist joaovissoci/7790701 by @joaovissoci,
# forked from rpietro/lsa_hack.r; created December 4, 2013 16:32.
# script stolen from http://goo.gl/YbQyAQ
# install.packages("tm")
# install.packages("ggplot2")
# install.packages("lsa")
# install.packages("scatterplot3d")
library(tm)
library(ggplot2)
library(lsa)
library(scatterplot3d)
library(qgraph)
# Locate the input data without changing the session's working directory.
# The directory can be supplied through the LSA_DATA_DIR environment variable;
# when it is unset or empty, the current working directory is used.
data_dir <- Sys.getenv("LSA_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- getwd()
}
input_path <- file.path(data_dir, "entrevistas_esporte.csv")
if (!file.exists(input_path)) {
  stop("Input file not found: ", input_path, call. = FALSE)
}
# Read the interview file. To pick the file interactively instead, use:
# input_path <- file.choose()
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0.
text <- read.csv(input_path, header = TRUE, stringsAsFactors = FALSE)
#------------------------------------------------------------------------------
# 1. Prepare data from http://goo.gl/1RB32f
#view <- factor(rep(c("view 1", "view 2", "view 3"), each = 3))
#view
# Copy the imported table and expose the interview text under the fixed
# column name `text`, which the corpus step reads.
df <- data.frame(text, stringsAsFactors = FALSE)
if (is.null(df$Entrevista)) {
  stop("Column 'Entrevista' not found in the input data.", call. = FALSE)
}
# stringsAsFactors = FALSE does not convert a column that is already a factor
# (as read.csv produces on R < 4.0), so coerce to character explicitly.
df$text <- as.character(df$Entrevista)
#------------------------------------------------------------------------------
# prepare corpus
corpus <- Corpus(VectorSource(df$text))
# Functions that work on plain character vectors are wrapped in
# content_transformer() so tm_map() keeps each document's class and metadata
# instead of replacing the document with a bare string.
corpus <- tm_map(
  corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
# Extra arguments to tm_map() are forwarded to the transformation.
corpus <- tm_map(corpus, removeWords, stopwords("portuguese"))
# Stemming needs the SnowballC package to be installed.
corpus <- tm_map(corpus, stemDocument, language = "portuguese")
corpus
#------------------------------------------------------------------------------
# Raw term-document matrix and the document distances used for MDS
# Dense matrix built from the corpus: terms in rows, documents in columns.
td.mat <- as.matrix(TermDocumentMatrix(corpus))
td.mat
# Transpose so that documents are rows; dist() compares rows pairwise.
dist.mat <- dist(t(td.mat))
dist.mat # inspect the distance matrix
#------------------------------------------------------------------------------
# MDS
# Classical multidimensional scaling of the raw-count distances into 2-D.
fit <- cmdscale(dist.mat, eig = TRUE, k = 2)
points <- data.frame(x = fit$points[, 1], y = fit$points[, 2])
# One point per document; its label (the row name in `df`) is drawn
# 0.2 units below the point.
ggplot(points, aes(x = x, y = y)) +
  geom_point() +
  geom_text(aes(y = y - 0.2, label = row.names(df)))
#------------------------------------------------------------------------------
# MDS with LSA
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat) # weighting
lsaSpace <- lsa(td.mat.lsa) # create LSA space
# Rebuild the term-document matrix from the LSA space once and reuse it below.
lsa.mat <- as.textmatrix(lsaSpace)
dist.mat.lsa <- dist(t(lsa.mat)) # compute distance matrix
dist.mat.lsa # check distance matrix
# The textmatrix print/summary methods and cosine() need the matrix itself:
# a `dist` object has no row/column names and is not a matrix, so cosine()
# rejects it and the textmatrix summary is meaningless on it.
print(lsa.mat, bag_cols = 5) # abbreviated view of the LSA textmatrix
summary(lsa.mat)
cosine(lsa.mat) # cosine similarity between documents (columns)
#------------------------------------------------------------------------------
# MDS
# Classical multidimensional scaling of the LSA-based distances into 2-D.
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2)
points <- data.frame(x = fit$points[, 1], y = fit$points[, 2])
# One point per document; its label (the row name in `df`) is drawn
# 0.2 units below the point.
ggplot(points, aes(x = x, y = y)) +
  geom_point() +
  geom_text(aes(y = y - 0.2, label = row.names(df)))
#------------------------------------------------------------------------------
# plot
# Classical multidimensional scaling of the LSA distances into 3-D.
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 3)
# Vertical-line ("h") 3-D scatter plot of the documents, default point colour.
scatterplot3d(
  fit$points[, 1], fit$points[, 2], fit$points[, 3],
  pch = 16, main = "Semantic Space Scaled to 3D",
  xlab = "x", ylab = "y", zlab = "z", type = "h"
)
# Network view of the LSA distances. qgraph() takes a weights matrix, so the
# `dist` object is expanded to a full symmetric matrix first.
# NOTE(review): the edge weights here are distances, not similarities, and
# cut/minimum are absolute thresholds on those weights - confirm they suit
# the scale of this data.
Q1 <- qgraph(
  as.matrix(dist.mat.lsa),
  borders = TRUE, cut = 80, minimum = 50, label.cex = 4,
  layout = "spring", label.norm = "OOOO"
)
#vsize = 3, cut = 5,
# (GitHub page footer: sign in on GitHub to comment on this gist.)