Skip to content

Instantly share code, notes, and snippets.

@teos0009
Created July 10, 2016 09:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teos0009/b182b3f90e426747712e9a3b8aaae5b1 to your computer and use it in GitHub Desktop.
# Install required packages on first run only; unconditionally calling
# install.packages() re-downloads them on every execution.
# rJava is needed by RWeka; Snowball provides word stemming.
for (pkg in c("RWeka", "rJava", "Snowball")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Attach every package used below. library() stops with an error when a
# package is missing (unlike require(), which merely returns FALSE), so the
# duplicate require() calls from the original are redundant and removed.
library(RXKCD)
library(XML)
library(tm)           # text mining: Corpus, tm_map, TermDocumentMatrix
library(wordcloud)
library(RColorBrewer) # colour palettes for the word cloud
library(rJava)
library(RWeka)
library(Snowball)     # stemming support
#install.packages("Snowball")#use by stemming of a word
#install.packages("rJava")#need rWeka
##Note: there are some handy, basic Twitter related functions here:
##https://github.com/matteoredaelli/twitter-r-utils
#For example:
#RemoveAtPeople <- function(tweet) {
# gsub("@\\w+", "", tweet)
#}
#Then for example, remove @'d names
#tweets <- as.vector(sapply(tw.df$text, RemoveAtPeople))
# Load the exported "links posted" CSV; empty strings and the literal
# strings "NA"/"NULL" are normalised to NA on read.
# NOTE(review): the original spelled the argument "stringsAsFactor" (no
# trailing "s"); it only worked through partial argument matching in
# read.table — spelled out in full here.
ap.df <- read.csv("sjteo links posted.csv", header = TRUE, sep = ",",
                  stringsAsFactors = FALSE, na.strings = c("NA", "", "NULL"))
# NOTE(review): attach() is discouraged (changes to ap.df are not reflected
# on the search path; detach/attach is needed to refresh). Kept for
# compatibility with the original workflow; all code below uses explicit
# ap.df$... access anyway.
attach(ap.df)
names(ap.df)     # inspect column names
ap.df$url        # all values in the url column
ap.df$url[766]   # spot-check one entry
# Keep columns: 3 = owner comment, 4 = time created, 5 = title, 6 = summary
ap.sub1 <- ap.df[, c(3, 4, 5, 6)]
ap.sub1[, c(1)]  # comment written by the owner
ap.sub1[, c(3)]  # post title
ap.sub1[, c(4)]  # summary text from the original source
# Alternative inputs (uncomment to analyse a different text column):
#ap.df <- read.csv("sjteo inbox1.csv", header = TRUE, stringsAsFactors = FALSE)
#ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 3])))#title of post
#ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 1])))#comment by me
ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 4])))#summary by ori
# Clean the corpus before building the term-document matrix.
# NOTE(review): with tm >= 0.6, base R functions such as tolower must be
# wrapped in content_transformer(); passing them bare silently turns the
# documents into plain character vectors and breaks downstream tm calls.
ap.corpus <- tm_map(ap.corpus, removeNumbers)
#ap.corpus <- tm_map(ap.corpus, stripWhitespace)
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, stemDocument)  # reduce words to their roots
# (stemming fails when rJava is misconfigured: "No CurrentVersion entry in key")
ap.corpus <- tm_map(ap.corpus, removePunctuation)
#ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
# Remove generic English stopwords plus corpus-specific noise words
# (chat slang, names, leftover HTML/URL fragments such as "quot", "http").
my_stopwords <- c(stopwords("english"),"quot","quotquotquot","null","sj","teo", "pls","mrchua","ill","dun","lol","btw","dont","yeah",
"thx","name","word","nope","teo","dad","nov","earlier","cockney","time","sir","okok","lady",
"coz","guys","using","girls","ercan","didnt","etc","lots","hehehe","stuff","hows","previous","shld"
,"haha","cedric","anyway","okie","sorry","timeline","photos","http","www","com","chk","amp")
ap.corpus <- tm_map(ap.corpus, removeWords, my_stopwords)
inspect(ap.corpus)  # debug only
# Build the term-document matrix and derive a word-frequency table
# (ap.tdm, ap.d and pal2 are reused by the word-cloud step below).
ap.tdm <- TermDocumentMatrix(ap.corpus)
#inspect(ap.tdm[1:100, 1:10])  # debug only

term.matrix <- as.matrix(ap.tdm)
term.freqs <- sort(rowSums(term.matrix), decreasing = TRUE)
ap.d <- data.frame(word = names(term.freqs), freq = term.freqs)
table(ap.d$freq)  # distribution of term frequencies

pal2 <- brewer.pal(8, "Dark2")  # colour palette for the word cloud

# Terms occurring at least 5 times across the corpus
findFreqTerms(ap.tdm, 5)
# Terms correlated with selected keywords (correlation thresholds differ)
findAssocs(ap.tdm, "arduino", 0.20)
findAssocs(ap.tdm, "makers", 0.35)
#clustering (not currently used)
#d<-dist(ap.df, method="euclidean")#dissimilarity matrix
#clusters<-hclust(d=d,method="ward")#ward's method to find clusters
# Render the word cloud to a large PNG; words are silently dropped when the
# canvas is too small to fit them at the chosen scale.
png("STEMM owner summary ori.png", width = 3280, height = 1800)
#png("owner comment.png", width=2280,height=1800)
#wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,#word could not fit if canvas too small
# finally= guarantees the device is closed even if wordcloud() errors,
# which would otherwise leave the PNG device open for later plots.
tryCatch(
  wordcloud(ap.d$word, ap.d$freq, scale = c(10, 0.9), min.freq = 4,
            max.words = Inf, random.order = FALSE, rot.per = .15,
            colors = pal2),
  finally = dev.off()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment