Text Mining Tutorial on a Kaggle Dataset
library(data.table)    #fast data manipulation
library(jsonlite)      #read JSON files
library(purrr)         #functional helpers (map, map_at)
library(RecordLinkage) #string distance functions
library(stringr)       #string counting and matching
library(tm)            #text mining framework
#load data
traind <- fromJSON("train.json")
test <- fromJSON("test.json")
#unlist all columns except the list columns photos and features
vars <- setdiff(names(traind), c("photos","features"))
train <- map_at(traind, vars, unlist) %>% as.data.table()
test <- map_at(test, vars, unlist) %>% as.data.table()
#subset the data sets to the columns used for text mining
train <- train[,.(listing_id, features, description, street_address, display_address, interest_level)]
test <- test[,.(listing_id, features, street_address, display_address, description)]
#explore the data
dim(train)
dim(test)
head(train)
head(test)
sapply(train, class)
sapply(test, class)
#join train and test so all text processing is done once
test[,interest_level := "None"] #placeholder label for the test rows
#use.names = TRUE binds by column name, since the two tables have different column orders
tdata <- rbindlist(list(train, test), use.names = TRUE)
#create features
#replace empty feature lists with the placeholder "aempty" so they survive the later steps
tdata[,features := ifelse(map_lgl(features, is_empty), "aempty", features)]
#count number of features per listing
tdata[,feature_count := lengths(features)]
#count number of words in the description
tdata[,desc_word_count := str_count(description, pattern = "\\w+")]
#total character length of the description
tdata[,desc_len := str_length(description)]
#Levenshtein distance between street and display address (lower = more similar)
tdata[,lev_sim := levenshteinDist(street_address, display_address)]
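#optional: RecordLinkage also provides levenshteinSim(), a similarity normalised
#to [0,1] (1 = identical), which is comparable across addresses of different
#lengths; lev_ratio is an illustrative extra feature, not part of the original pipeline
tdata[,lev_ratio := levenshteinSim(street_address, display_address)]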
dim(tdata)
#Feature variables
#unroll the list column: one row per (listing_id, feature) pair
fdata <- data.table(listing_id = rep(tdata$listing_id, lengths(tdata$features)), features = unlist(tdata$features))
head(fdata) #the features are now tokenized, i.e. one feature per row
#convert features to lower case
fdata[,features := tolower(features)]
#calculate the count for every feature
fdata[,count := .N, features]
fdata[order(count)][1:20] #the 20 rarest features
#keep features which occur 100 or more times
fdata <- fdata[count >= 100]
#cast to wide format: one indicator column per feature
fdata <- dcast(data = fdata, formula = listing_id ~ features, fun.aggregate = length, value.var = "features")
#this results in 96 features
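#sanity check (optional): one row per listing, and with 96 retained features
#we would expect 97 columns (listing_id plus one column per feature)
dim(fdata)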
#create a corpus of descriptions
text_corpus <- Corpus(VectorSource(tdata$description))
inspect(text_corpus[1:4]) #check the first 4 documents
#the corpus is a list-like R object of class Corpus
print(lapply(text_corpus[1:2], as.character))
#let's clean the data
dropword <- "br" #HTML line-break residue in the descriptions
#remove br
text_corpus <- tm_map(text_corpus, removeWords, dropword)
print(as.character(text_corpus[[1]]))
#to lower case; content_transformer() keeps the document class intact
text_corpus <- tm_map(text_corpus, content_transformer(tolower))
print(as.character(text_corpus[[1]]))
#remove punctuation
text_corpus <- tm_map(text_corpus, removePunctuation)
print(as.character(text_corpus[[1]]))
#remove numbers
text_corpus <- tm_map(text_corpus, removeNumbers)
print(as.character(text_corpus[[1]]))
#remove extra whitespace
text_corpus <- tm_map(text_corpus, stripWhitespace, lazy = T)
print(as.character(text_corpus[[1]]))
#remove stopwords
text_corpus <- tm_map(text_corpus, removeWords, c(stopwords('english')))
print(as.character(text_corpus[[1]]))
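#the c() wrapper above makes it easy to append domain-specific stopwords;
#the extra words below are illustrative assumptions, not part of the original script
#text_corpus <- tm_map(text_corpus, removeWords, c(stopwords('english'), "apartment", "building"))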
#convert back to plain text documents
text_corpus <- tm_map(text_corpus, PlainTextDocument)
#perform stemming - this should always follow the plain-text conversion
text_corpus <- tm_map(text_corpus, stemDocument, language = "english")
print(as.character(text_corpus[[1]]))
text_corpus[[1]]$content
#convert to a document-term matrix
docterm_corpus <- DocumentTermMatrix(text_corpus)
dim(docterm_corpus)
#rm(text_corpus)
#this matrix has 124011 documents and 81143 terms
#remove sparse terms
#the sparse parameter drops terms that are more than 95% sparse,
#i.e. terms absent from at least 95% of the documents
new_docterm_corpus <- removeSparseTerms(docterm_corpus, sparse = 0.95)
dim(new_docterm_corpus)
#the new matrix has 87 terms left
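#to see what the 0.95 threshold means, per-term sparsity can be computed by
#hand: the share of documents a term does NOT appear in (an optional check,
#assuming the dense matrix fits in memory)
m <- as.matrix(new_docterm_corpus)
term_sparsity <- 1 - colSums(m > 0)/nrow(m)
summary(term_sparsity) #every retained term should be <= 0.95
rm(m)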
#Exploring these features
#find frequent terms
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
doc_features <- data.table(name = names(colS), count = colS)
#most frequent and least frequent words
doc_features[order(-count)][1:10] #top 10 most frequent words
doc_features[order(count)][1:10] #10 least frequent words
#visualize the data
library(ggplot2)
library(ggthemes)
ggplot(doc_features[count > 20000], aes(name, count)) +
geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
theme_economist() +
scale_color_economist() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #applied after the complete theme so the rotated labels are not overridden
#check which terms co-occur with the top features; highly correlated ones are candidates for removal
findAssocs(new_docterm_corpus, "street", corlimit = 0.5)
findAssocs(new_docterm_corpus, "new", corlimit = 0.5)
#create wordclouds
library(wordcloud)
#this wordcloud is exported
wordcloud(names(colS), colS, min.freq = 100, scale = c(6,.1), colors = brewer.pal(6, 'Dark2'))
wordcloud(names(colS), colS, min.freq = 5000, scale = c(6,.1), colors = brewer.pal(6, 'Dark2'))
#create the data set for training
processed_data <- as.data.table(as.matrix(new_docterm_corpus))
#Data 1
#combining the term counts with the ids and labels
data_one <- cbind(data.table(listing_id = tdata$listing_id, interest_level = tdata$interest_level), processed_data)
#merging the one-hot encoded features
data_one <- fdata[data_one, on = "listing_id"]
#split the data set back into train and test
train_one <- data_one[interest_level != "None"]
test_one <- data_one[interest_level == "None"]
test_one[,interest_level := NULL]
rm(data_one, processed_data)
#modeling data 1
#encode the target as integers; factor levels are alphabetical
train_one[,interest_level := as.integer(as.factor(interest_level)) - 1]
#high = 0, low = 1, medium = 2
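#optional check of the encoding: class counts under the integer codes above
table(train_one$interest_level) #0 = high, 1 = low, 2 = medium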
library(caTools) #sample.split
library(xgboost)
#logical vector marking a stratified 60% split of the training rows
sp <- sample.split(Y = train_one$interest_level, SplitRatio = 0.6)
#create data for xgboost
xg_val <- train_one[sp]
listing_id <- train_one$listing_id
target <- train_one$interest_level
xg_val_target <- target[sp]
#note: d_train keeps all rows, so the validation rows overlap the training
#rows and the watchlist score below is optimistic
d_train <- xgb.DMatrix(data = as.matrix(train_one[,-c("listing_id","interest_level"), with = F]), label = target)
d_val <- xgb.DMatrix(data = as.matrix(xg_val[,-c("listing_id","interest_level"), with = F]), label = xg_val_target)
d_test <- xgb.DMatrix(data = as.matrix(test_one[,-c("listing_id"), with = F]))
param <- list(booster = "gbtree",
              objective = "multi:softprob",
              eval_metric = "mlogloss",
              #nthread = 13,
              num_class = 3,
              eta = .02,
              gamma = 1,
              max_depth = 4,
              min_child_weight = 1,
              subsample = .7,
              colsample_bytree = .5
)
set.seed(2017)
watch <- list(val = d_val, train = d_train)
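#the commented-out nrounds line in xgb.train below refers to a cross-validation
#object; a minimal sketch of how such a run could look (5 folds, 1000 rounds and
#early stopping are assumptions, not taken from the original script):
#xgb2cv <- xgb.cv(params = param, data = d_train, nrounds = 1000, nfold = 5,
#                 early_stopping_rounds = 40, print_every_n = 10)
#xgb2cv$best_ntreelimit then gives the best number of boosting rounds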
xgb2 <- xgb.train(data = d_train,
                  params = param,
                  watchlist = watch,
                  #nrounds = xgb2cv$best_ntreelimit
                  nrounds = 500,
                  print_every_n = 10
) #validation - 0.6817
#reshape the flat prediction vector into one row per listing, one column per class
xg_pred <- as.data.table(t(matrix(predict(xgb2, d_test), nrow = 3, ncol = nrow(d_test))))
colnames(xg_pred) <- c("high","low","medium")
xg_pred <- cbind(data.table(listing_id = test$listing_id), xg_pred)
fwrite(xg_pred, "xgb_textmining.csv") #0.70473 on LB
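#optional: inspect which terms and features drive the model, using xgboost's
#built-in importance helper
imp <- xgb.importance(model = xgb2)
head(imp, 20)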
#Exercise 1
#TF-IDF data set
data_mining_tf <- as.data.table(as.matrix(weightTfIdf(new_docterm_corpus)))
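#the TF-IDF weights can replace the raw counts by rebuilding the data set the
#same way as Data 1 above; a sketch under that assumption (data_two, train_two
#and test_two are hypothetical names, and the same model parameters would be reused)
data_two <- cbind(data.table(listing_id = tdata$listing_id, interest_level = tdata$interest_level), data_mining_tf)
data_two <- fdata[data_two, on = "listing_id"]
train_two <- data_two[interest_level != "None"]
test_two <- data_two[interest_level == "None"]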
#Exercise 2
#Bigram model
#install.packages("RWeka") #run once if not already installed
library(RWeka)
#bigram tokenizer function
Bigram_Tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
#create a document-term matrix of bigrams
bi_docterm_matrix <- DocumentTermMatrix(text_corpus, control = list(tokenize = Bigram_Tokenizer))
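#a bigram matrix is far sparser than the unigram one, so a higher sparsity
#cut-off is usually needed before modeling; 0.99 is an illustrative assumption,
#not a tuned value, and bi_dtm_small is a hypothetical name
bi_dtm_small <- removeSparseTerms(bi_docterm_matrix, sparse = 0.99)
dim(bi_dtm_small)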