Skip to content

Instantly share code, notes, and snippets.

@teos0009
Created July 10, 2016 09:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teos0009/b182b3f90e426747712e9a3b8aaae5b1 to your computer and use it in GitHub Desktop.
# Install required packages on first run only; unconditionally calling
# install.packages() re-downloads them on every execution.
# rJava is needed by RWeka; Snowball provides word stemming.
for (pkg in c("RWeka", "rJava", "Snowball")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Attach every package used below. library() stops with an error when a
# package is missing (unlike require(), which merely returns FALSE), so the
# duplicate require() calls from the original are redundant and removed.
library(RXKCD)
library(XML)
library(tm)           # text mining: Corpus, tm_map, TermDocumentMatrix
library(wordcloud)
library(RColorBrewer) # colour palettes for the word cloud
library(rJava)
library(RWeka)
library(Snowball)     # stemming support
#install.packages("Snowball")#use by stemming of a word
#install.packages("rJava")#need rWeka
##Note: there are some handy, basic Twitter related functions here:
##https://github.com/matteoredaelli/twitter-r-utils
#For example:
#RemoveAtPeople <- function(tweet) {
# gsub("@\\w+", "", tweet)
#}
#Then for example, remove @'d names
#tweets <- as.vector(sapply(tw.df$text, RemoveAtPeople))
# Load the exported "links posted" CSV; empty strings and the literal
# strings "NA"/"NULL" are normalised to NA on read.
# NOTE(review): the original spelled the argument "stringsAsFactor" (no
# trailing "s"); it only worked through partial argument matching in
# read.table — spelled out in full here.
ap.df <- read.csv("sjteo links posted.csv", header = TRUE, sep = ",",
                  stringsAsFactors = FALSE, na.strings = c("NA", "", "NULL"))
# NOTE(review): attach() is discouraged (changes to ap.df are not reflected
# on the search path; detach/attach is needed to refresh). Kept for
# compatibility with the original workflow; all code below uses explicit
# ap.df$... access anyway.
attach(ap.df)
names(ap.df)     # inspect column names
ap.df$url        # all values in the url column
ap.df$url[766]   # spot-check one entry
# Keep columns: 3 = owner comment, 4 = time created, 5 = title, 6 = summary
ap.sub1 <- ap.df[, c(3, 4, 5, 6)]
ap.sub1[, c(1)]  # comment written by the owner
ap.sub1[, c(3)]  # post title
ap.sub1[, c(4)]  # summary text from the original source
# Alternative inputs (uncomment to analyse a different text column):
#ap.df <- read.csv("sjteo inbox1.csv", header = TRUE, stringsAsFactors = FALSE)
#ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 3])))#title of post
#ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 1])))#comment by me
ap.corpus <- Corpus(DataframeSource(data.frame(ap.sub1[, 4])))#summary by ori
# Clean the corpus before building the term-document matrix.
# NOTE(review): with tm >= 0.6, base R functions such as tolower must be
# wrapped in content_transformer(); passing them bare silently turns the
# documents into plain character vectors and breaks downstream tm calls.
ap.corpus <- tm_map(ap.corpus, removeNumbers)
#ap.corpus <- tm_map(ap.corpus, stripWhitespace)
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, stemDocument)  # reduce words to their roots
# (stemming fails when rJava is misconfigured: "No CurrentVersion entry in key")
ap.corpus <- tm_map(ap.corpus, removePunctuation)
#ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
# Remove generic English stopwords plus corpus-specific noise words
# (chat slang, names, leftover HTML/URL fragments such as "quot", "http").
my_stopwords <- c(stopwords("english"),"quot","quotquotquot","null","sj","teo", "pls","mrchua","ill","dun","lol","btw","dont","yeah",
"thx","name","word","nope","teo","dad","nov","earlier","cockney","time","sir","okok","lady",
"coz","guys","using","girls","ercan","didnt","etc","lots","hehehe","stuff","hows","previous","shld"
,"haha","cedric","anyway","okie","sorry","timeline","photos","http","www","com","chk","amp")
ap.corpus <- tm_map(ap.corpus, removeWords, my_stopwords)
inspect(ap.corpus)  # debug only
# Build the term-document matrix and derive a word-frequency table
# (ap.tdm, ap.d and pal2 are reused by the word-cloud step below).
ap.tdm <- TermDocumentMatrix(ap.corpus)
#inspect(ap.tdm[1:100, 1:10])  # debug only

term.matrix <- as.matrix(ap.tdm)
term.freqs <- sort(rowSums(term.matrix), decreasing = TRUE)
ap.d <- data.frame(word = names(term.freqs), freq = term.freqs)
table(ap.d$freq)  # distribution of term frequencies

pal2 <- brewer.pal(8, "Dark2")  # colour palette for the word cloud

# Terms occurring at least 5 times across the corpus
findFreqTerms(ap.tdm, 5)
# Terms correlated with selected keywords (correlation thresholds differ)
findAssocs(ap.tdm, "arduino", 0.20)
findAssocs(ap.tdm, "makers", 0.35)
#clustering (not currently used)
#d<-dist(ap.df, method="euclidean")#dissimilarity matrix
#clusters<-hclust(d=d,method="ward")#ward's method to find clusters
# Render the word cloud to a large PNG; words are silently dropped when the
# canvas is too small to fit them at the chosen scale.
png("STEMM owner summary ori.png", width = 3280, height = 1800)
#png("owner comment.png", width=2280,height=1800)
#wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,#word could not fit if canvas too small
# finally= guarantees the device is closed even if wordcloud() errors,
# which would otherwise leave the PNG device open for later plots.
tryCatch(
  wordcloud(ap.d$word, ap.d$freq, scale = c(10, 0.9), min.freq = 4,
            max.words = Inf, random.order = FALSE, rot.per = .15,
            colors = pal2),
  finally = dev.off()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment