Created
November 11, 2013 22:55
-
-
Save jkuruzovich/7422079 to your computer and use it in GitHub Desktop.
New Code for Class - See Click Predict Fix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: connect R to Google Visualizations via the googleVis package.
# Only install the package when it is missing, instead of re-downloading
# it on every run of the script.
if (!requireNamespace("googleVis", quietly = TRUE)) {
  install.packages("googleVis")
}
library(googleVis)
demo(WorldBank)
# Kaggle "See Click Predict Fix" competition: load the data sets.
# NOTE(review): setwd() ties this script to one machine's folder layout;
# adjust the path when running elsewhere.
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Quick look at the column types and the first few rows of the training set.
str(train)
head(train)
# Feature engineering: recover the city of each report from its coordinates.
# The data is known to span 4 cities, so k-means the coordinates into 4 groups.
# (Assumes columns 2:3 of train are latitude and longitude — confirm with str().)
geo <- train[, 2:3]
set.seed(2)
# Cluster on BOTH latitude and longitude. The original clustered on latitude
# alone, which cannot reliably separate cities at similar latitudes
# (Richmond 37.54 vs Oakland 37.80).
fit <- kmeans(geo, 4)  # 4-cluster solution
# Sanity check: the plot should show 4 distinct, consistently coloured areas.
plot(geo$latitude, geo$longitude, col = fit$cluster)
# Capture the cluster assignment and compute each cluster's mean coordinates.
citycl <- fit$cluster
geob <- cbind(geo, citycl)
cities <- aggregate(geob, by = list(geob$citycl), FUN = mean)
View(cities)
# Identify each cluster's city from its mean longitude/latitude:
#   41.85662  -87.68507  Chicago, IL
#   37.54046  -77.46269  Richmond, VA
#   41.31132  -72.92412  New Haven, CT
#   37.80239 -122.24116  Oakland, CA
# NOTE: cluster numbering depends on the random start; inspect `cities` and
# reorder the labels below if needed.
# BUG FIX: fit$cluster is an integer vector, so `levels(citycl) <- ...` only
# attaches an unused attribute and never relabels the values. Convert to a
# factor with explicit labels instead.
citycl <- factor(citycl, levels = 1:4,
                 labels = c("Chicago", "Richmond", "New Haven", "Oakland"))
# Add the new city feature back onto the training data.
train <- cbind(train, citycl)
# Intro to text data analysis.
# Basic text features: character counts of the two free-text fields.
# (summary/description are factors at this point, hence as.character().)
train$summarync <- nchar(as.character(train$summary))
train$descriptionnc <- nchar(as.character(train$description))
library(tm)
# Build one corpus per text field and normalise the text: lowercase, then
# strip punctuation and digits.
myCorpus <- Corpus(VectorSource(train$summary))
myCorpus2 <- Corpus(VectorSource(train$description))
# tolower() is a plain character function, not a tm transformation; wrap it
# in content_transformer() so the corpus structure is preserved (required in
# tm >= 0.6, where a bare tolower corrupts the corpus).
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus2 <- tm_map(myCorpus2, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus2 <- tm_map(myCorpus2, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus2 <- tm_map(myCorpus2, removeNumbers)
# Term-document matrices keeping terms of at least 3 characters.
# (`minWordLength` is the pre-0.6 tm option name; `wordLengths` is current.)
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, Inf)))
myDtm2 <- TermDocumentMatrix(myCorpus2, control = list(wordLengths = c(3, Inf)))
library(wordcloud)
# Word frequencies from the summary-field term-document matrix.
# (Fixed: the original had a stray "> " console prompt pasted into this
# section, which is a syntax error when the script is sourced.)
m <- as.matrix(myDtm)
# Calculate the frequency of each word, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# NOTE(review): the original also renamed a "miners" term to "mining" here —
# leftover from the RDataMining tutorial this was adapted from, and a no-op
# on this dataset; dropped.
wordcloud(d$word, d$freq, min.freq = 15)
# List the frequently used terms (appearing 30+ times).
findFreqTerms(myDtm, lowfreq = 30)
# Remove stop words (irrelevant high-frequency terms) from the summary corpus.
# "the" and "you" are already in stopwords('english'); adding them is harmless.
myStopwords <- c(stopwords("english"), "the", "you")
# Keep "r" as a term of interest. BUG FIX: the original used
# myStopwords[-which(myStopwords == "r")], but when "r" is absent (as it is
# from the English stopword list) which() returns integer(0) and
# x[-integer(0)] selects NOTHING, silently emptying the whole stopword
# vector. setdiff() is safe in both cases.
myStopwords <- setdiff(myStopwords, "r")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
# Binary keyword features from the summary text.
# Single-term feature: does the summary mention graffiti?
train$s_illeg <- grepl("graffiti", train$summary, ignore.case = TRUE)
# Feature from an array of related terms, matched as a regex alternation.
illegal <- c("graffiti", "illegal", "drug")
train$s_illeg <- grepl(paste(illegal, collapse = "|"), train$summary,
                       ignore.case = TRUE)
trash <- c("dump", "abandon", "trash", "pickup", "recycling", "pile", "refuse")
# BUG FIX: the original repeated the `illegal` pattern here and assigned to
# s_illeg again, so the trash feature was never created. Match the trash
# terms and store them in their own column.
train$s_trash <- grepl(paste(trash, collapse = "|"), train$summary,
                       ignore.case = TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment