#New Code for Class - See Click Predict Fix
#This is a cool demo that shows the potential of connecting with Google Visualizations
install.packages("googleVis")
library(googleVis)
demo(WorldBank)
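#A minimal sketch of building a chart ourselves with gvisGeoChart(); the data
#frame below is made up for illustration, not taken from the Kaggle data
cityData <- data.frame(city = c("Chicago", "Richmond", "New Haven", "Oakland"),
                       issues = c(100, 50, 75, 60))
geoChart <- gvisGeoChart(cityData, locationvar = "city", colorvar = "issues")
plot(geoChart)  #opens the chart in a browser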
#Start of Kaggle See Click Predict
setwd("~/Dropbox/30_classes/analytics/2013_fall/kaggle2")
train <- read.csv("train.csv")
test <- read.csv("test.csv")
str(train)
head(train)
#We know there are 4 Cities
geo <- train[,2:3]
set.seed(2)
#Cluster on both latitude and longitude; latitude alone can conflate cities at similar latitudes
fit <- kmeans(geo, 4) # 4 cluster solution
#Make sure the clustering worked: the plot should show 4 distinct areas. You may have to rerun kmeans if the dots are not colored correctly.
plot(geo$latitude, geo$longitude, col = fit$cluster)
#Capture the cluster assignment
citycl<-fit$cluster
geob<-cbind(geo,citycl)
cities<-aggregate(geob, by=list(geob$citycl), FUN=mean)
View(cities)
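#Sanity check (an added step): each cluster should hold one city's worth of rows
table(geob$citycl)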
#Once you have the average latitude/longitude for each cluster, you can look up the city online:
#  cluster  latitude   longitude   city
#  1        41.85662    -87.68507  Chicago, IL
#  2        37.54046    -77.46269  Richmond, VA
#  3        41.31132    -72.92412  New Haven, CT
#  4        37.80239   -122.24116  Oakland, CA
#Your cluster numbers may come out in a different order, so adjust the labels below to match.
#citycl is numeric, so convert it to a labeled factor (levels() alone will not relabel it)
citycl <- factor(citycl, levels = 1:4, labels = c('Chicago', 'Richmond', 'New Haven', 'Oakland'))
#Let's add our new feature back to the training dataset
train<-cbind(train, citycl)
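#Verify the new column: counts per city label should match the cluster sizes above
table(train$citycl)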
#Intro to text data analysis
#Basic Text Data Features
train$summarync<-nchar(as.character(train$summary))
train$descriptionnc<-nchar(as.character(train$description))
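#Quick look at the new length features (an added check)
summary(train$summarync)
summary(train$descriptionnc)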
#Note: read.csv() loads the text columns as factors by default, which is why as.character() is needed above.
library(tm)
myCorpus <- Corpus(VectorSource(train$summary))
myCorpus2 <- Corpus(VectorSource(train$description))
#Newer versions of tm require base functions to be wrapped in content_transformer()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus2 <- tm_map(myCorpus2, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus2 <- tm_map(myCorpus2, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus2 <- tm_map(myCorpus2, removeNumbers)
#minWordLength is deprecated in newer tm; wordLengths sets the min/max word length
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, Inf)))
myDtm2 <- TermDocumentMatrix(myCorpus2, control = list(wordLengths = c(3, Inf)))
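#Peek at the result (an added check); inspect() is part of tm
dim(myDtm)
inspect(myDtm[1:10, 1:5])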
library(wordcloud)
m <- as.matrix(myDtm)
#Calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
#Merge a term variant into its stem (a no-op if "miners" is absent from this
#corpus; adapt the pair to terms that actually appear in your data)
k <- which(names(v) == "miners")
myNames[k] <- "mining"
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=15)
#This provides a list of frequently used terms.
findFreqTerms(myDtm, lowfreq=30)
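#Related: findAssocs() (also in tm) lists terms correlated with a given term.
#"graffiti" is assumed to occur in these summaries; 0.2 is an arbitrary cutoff.
findAssocs(myDtm, "graffiti", 0.2)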
#This will get rid of some irrelevant terms; "the" and "you" are already in the
#English stopword list, so adding them again is harmless
myStopwords <- c(stopwords('english'), "the", "you")
#To keep a stopword as a term, drop it from the list; setdiff() is safe even when
#the word is absent (myStopwords[-which(...)] empties the vector if nothing matches)
myStopwords <- setdiff(myStopwords, "r")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
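#Added step: myDtm was built before the stopwords were removed, so rebuild it
#if you want the frequencies and the wordcloud to reflect the cleaned corpus
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, Inf)))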
#This will create a feature based on one item
train$s_illeg <- grepl("graffiti", train$summary, ignore.case=TRUE)
#This will create a feature based on an array of related terms.
illegal<-c("graffiti", "illegal", "drug")
train$s_illeg <- grepl(paste(illegal, collapse='|'), train$summary, ignore.case=TRUE)
trash<-c("dump", "abandon", "trash", "pickup","recycling","pile", "refuse")
train$s_trash <- grepl(paste(trash, collapse='|'), train$summary, ignore.case=TRUE)
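#A quick check (an added step): how often do the two new flags co-occur?
table(train$s_illeg, train$s_trash)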