Created
November 11, 2013 22:55
-
-
Save jkuruzovich/7422079 to your computer and use it in GitHub Desktop.
New Code for Class - See Click Predict Fix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: connect R to Google Visualizations via the googleVis package.
# Only install the package when it is missing, instead of re-downloading
# it on every run of the script.
if (!requireNamespace("googleVis", quietly = TRUE)) {
  install.packages("googleVis")
}
library(googleVis)
demo(WorldBank)
# Kaggle "See Click Predict Fix" competition: load the data sets.
# NOTE(review): setwd() ties this script to one machine's folder layout;
# adjust the path when running elsewhere.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Quick look at the column types and the first few rows of the training set.
str(train)
head(train)
# Feature engineering: recover the city of each report from its coordinates.
# The data is known to span 4 cities, so k-means the coordinates into 4 groups.
# (Assumes columns 2:3 of train are latitude and longitude — confirm with str().)
geo <- train[, 2:3]
set.seed(2)
# Cluster on BOTH latitude and longitude. The original clustered on latitude
# alone, which cannot reliably separate cities at similar latitudes
# (Richmond 37.54 vs Oakland 37.80).
fit <- kmeans(geo, 4)  # 4-cluster solution
# Sanity check: the plot should show 4 distinct, consistently coloured areas.
plot(geo$latitude, geo$longitude, col = fit$cluster)
# Capture the cluster assignment and compute each cluster's mean coordinates.
citycl <- fit$cluster
geob <- cbind(geo, citycl)
cities <- aggregate(geob, by = list(geob$citycl), FUN = mean)
View(cities)
# Identify each cluster's city from its mean longitude/latitude:
#   41.85662  -87.68507  Chicago, IL
#   37.54046  -77.46269  Richmond, VA
#   41.31132  -72.92412  New Haven, CT
#   37.80239 -122.24116  Oakland, CA
# NOTE: cluster numbering depends on the random start; inspect `cities` and
# reorder the labels below if needed.
# BUG FIX: fit$cluster is an integer vector, so `levels(citycl) <- ...` only
# attaches an unused attribute and never relabels the values. Convert to a
# factor with explicit labels instead.
citycl <- factor(citycl, levels = 1:4,
                 labels = c("Chicago", "Richmond", "New Haven", "Oakland"))
# Add the new city feature back onto the training data.
train <- cbind(train, citycl)
# Intro to text data analysis.
# Basic text features: character counts of the two free-text fields.
# (summary/description are factors at this point, hence as.character().)
train$summarync <- nchar(as.character(train$summary))
train$descriptionnc <- nchar(as.character(train$description))
library(tm)
# Build one corpus per text field and normalise the text: lowercase, then
# strip punctuation and digits.
myCorpus <- Corpus(VectorSource(train$summary))
myCorpus2 <- Corpus(VectorSource(train$description))
# tolower() is a plain character function, not a tm transformation; wrap it
# in content_transformer() so the corpus structure is preserved (required in
# tm >= 0.6, where a bare tolower corrupts the corpus).
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus2 <- tm_map(myCorpus2, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus2 <- tm_map(myCorpus2, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus2 <- tm_map(myCorpus2, removeNumbers)
# Term-document matrices keeping terms of at least 3 characters.
# (`minWordLength` is the pre-0.6 tm option name; `wordLengths` is current.)
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, Inf)))
myDtm2 <- TermDocumentMatrix(myCorpus2, control = list(wordLengths = c(3, Inf)))
library(wordcloud)
# Word frequencies from the summary-field term-document matrix.
# (Fixed: the original had a stray "> " console prompt pasted into this
# section, which is a syntax error when the script is sourced.)
m <- as.matrix(myDtm)
# Calculate the frequency of each word, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# NOTE(review): the original also renamed a "miners" term to "mining" here —
# leftover from the RDataMining tutorial this was adapted from, and a no-op
# on this dataset; dropped.
wordcloud(d$word, d$freq, min.freq = 15)
# List the frequently used terms (appearing 30+ times).
findFreqTerms(myDtm, lowfreq = 30)
# Remove stop words (irrelevant high-frequency terms) from the summary corpus.
# "the" and "you" are already in stopwords('english'); adding them is harmless.
myStopwords <- c(stopwords("english"), "the", "you")
# Keep "r" as a term of interest. BUG FIX: the original used
# myStopwords[-which(myStopwords == "r")], but when "r" is absent (as it is
# from the English stopword list) which() returns integer(0) and
# x[-integer(0)] selects NOTHING, silently emptying the whole stopword
# vector. setdiff() is safe in both cases.
myStopwords <- setdiff(myStopwords, "r")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
# Binary keyword features from the summary text.
# Single-term feature: does the summary mention graffiti?
train$s_illeg <- grepl("graffiti", train$summary, ignore.case = TRUE)
# Feature from an array of related terms, matched as a regex alternation.
illegal <- c("graffiti", "illegal", "drug")
train$s_illeg <- grepl(paste(illegal, collapse = "|"), train$summary,
                       ignore.case = TRUE)
trash <- c("dump", "abandon", "trash", "pickup", "recycling", "pile", "refuse")
# BUG FIX: the original repeated the `illegal` pattern here and assigned to
# s_illeg again, so the trash feature was never created. Match the trash
# terms and store them in their own column.
train$s_trash <- grepl(paste(trash, collapse = "|"), train$summary,
                       ignore.case = TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment