Text Mining Tutorial on a Kaggle Dataset
library(data.table)    #fast data manipulation
library(jsonlite)      #read JSON files
library(purrr)         #functional helpers (map, map_at)
library(RecordLinkage) #string distance functions
library(stringr)       #string counting and matching
library(tm)            #text mining framework
#load data
traind <- fromJSON("train.json")
test <- fromJSON("test.json")
#unlist all columns except the list columns photos and features
vars <- setdiff(names(traind), c("photos","features"))
train <- map_at(traind, vars, unlist) %>% as.data.table()
test <- map_at(test, vars, unlist) %>% as.data.table()
#subset the data sets to the columns used for text mining
train <- train[,.(listing_id, features, description, street_address, display_address, interest_level)]
test <- test[,.(listing_id, features, street_address, display_address, description)]
#explore the data
dim(train)
dim(test)
head(train)
head(test)
sapply(train, class)
sapply(test, class)
#join train and test so all text processing is done once
test[,interest_level := "None"] #placeholder label for the test rows
#use.names = TRUE binds by column name, since the two tables have different column orders
tdata <- rbindlist(list(train, test), use.names = TRUE)
#create features
#replace empty feature lists with the placeholder "aempty" so they survive the later steps
tdata[,features := ifelse(map_lgl(features, is_empty), "aempty", features)]
#count number of features per listing
tdata[,feature_count := lengths(features)]
#count number of words in the description
tdata[,desc_word_count := str_count(description, pattern = "\\w+")]
#total character length of the description
tdata[,desc_len := str_length(description)]
#Levenshtein distance between street and display address (lower = more similar)
tdata[,lev_sim := levenshteinDist(street_address, display_address)]
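#optional: RecordLinkage also provides levenshteinSim(), a similarity normalised
#to [0,1] (1 = identical), which is comparable across addresses of different
#lengths; lev_ratio is an illustrative extra feature, not part of the original pipeline
tdata[,lev_ratio := levenshteinSim(street_address, display_address)]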
dim(tdata)
#Feature variables
#unroll the list column: one row per (listing_id, feature) pair
fdata <- data.table(listing_id = rep(tdata$listing_id, lengths(tdata$features)), features = unlist(tdata$features))
head(fdata) #the features are now tokenized, i.e. one feature per row
#convert features to lower case
fdata[,features := tolower(features)]
#calculate the count for every feature
fdata[,count := .N, features]
fdata[order(count)][1:20] #the 20 rarest features
#keep features which occur 100 or more times
fdata <- fdata[count >= 100]
#cast to wide format: one indicator column per feature
fdata <- dcast(data = fdata, formula = listing_id ~ features, fun.aggregate = length, value.var = "features")
#this results in 96 features
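#sanity check (optional): one row per listing, and with 96 retained features
#we would expect 97 columns (listing_id plus one column per feature)
dim(fdata)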
#create a corpus of descriptions
text_corpus <- Corpus(VectorSource(tdata$description))
inspect(text_corpus[1:4]) #check the first 4 documents
#the corpus is a list-like R object of class Corpus
print(lapply(text_corpus[1:2], as.character))
#let's clean the data
dropword <- "br" #HTML line-break residue in the descriptions
#remove br
text_corpus <- tm_map(text_corpus, removeWords, dropword)
print(as.character(text_corpus[[1]]))
#to lower case; content_transformer() keeps the document class intact
text_corpus <- tm_map(text_corpus, content_transformer(tolower))
print(as.character(text_corpus[[1]]))
#remove punctuation
text_corpus <- tm_map(text_corpus, removePunctuation)
print(as.character(text_corpus[[1]]))
#remove numbers
text_corpus <- tm_map(text_corpus, removeNumbers)
print(as.character(text_corpus[[1]]))
#remove extra whitespace
text_corpus <- tm_map(text_corpus, stripWhitespace, lazy = T)
print(as.character(text_corpus[[1]]))
#remove stopwords
text_corpus <- tm_map(text_corpus, removeWords, c(stopwords('english')))
print(as.character(text_corpus[[1]]))
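#the c() wrapper above makes it easy to append domain-specific stopwords;
#the extra words below are illustrative assumptions, not part of the original script
#text_corpus <- tm_map(text_corpus, removeWords, c(stopwords('english'), "apartment", "building"))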
#convert back to plain text documents
text_corpus <- tm_map(text_corpus, PlainTextDocument)
#perform stemming - this should always follow the plain-text conversion
text_corpus <- tm_map(text_corpus, stemDocument, language = "english")
print(as.character(text_corpus[[1]]))
text_corpus[[1]]$content
#convert to a document-term matrix
docterm_corpus <- DocumentTermMatrix(text_corpus)
dim(docterm_corpus)
#rm(text_corpus)
#this matrix has 124011 documents and 81143 terms
#remove sparse terms
#the sparse parameter drops terms that are more than 95% sparse,
#i.e. terms absent from at least 95% of the documents
new_docterm_corpus <- removeSparseTerms(docterm_corpus, sparse = 0.95)
dim(new_docterm_corpus)
#the new matrix has 87 terms left
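#to see what the 0.95 threshold means, per-term sparsity can be computed by
#hand: the share of documents a term does NOT appear in (an optional check,
#assuming the dense matrix fits in memory)
m <- as.matrix(new_docterm_corpus)
term_sparsity <- 1 - colSums(m > 0)/nrow(m)
summary(term_sparsity) #every retained term should be <= 0.95
rm(m)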
#Exploring these features
#find frequent terms
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
doc_features <- data.table(name = names(colS), count = colS)
#most frequent and least frequent words
doc_features[order(-count)][1:10] #top 10 most frequent words
doc_features[order(count)][1:10] #10 least frequent words
#visualize the data
library(ggplot2)
library(ggthemes)
ggplot(doc_features[count > 20000], aes(name, count)) +
geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
theme_economist() +
scale_color_economist() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #applied after the complete theme so the rotated labels are not overridden
#check which terms co-occur with the top features; highly correlated ones are candidates for removal
findAssocs(new_docterm_corpus, "street", corlimit = 0.5)
findAssocs(new_docterm_corpus, "new", corlimit = 0.5)
#create wordclouds
library(wordcloud)
#this wordcloud is exported
wordcloud(names(colS), colS, min.freq = 100, scale = c(6,.1), colors = brewer.pal(6, 'Dark2'))
wordcloud(names(colS), colS, min.freq = 5000, scale = c(6,.1), colors = brewer.pal(6, 'Dark2'))
#create the data set for training
processed_data <- as.data.table(as.matrix(new_docterm_corpus))
#Data 1
#combining the term counts with the ids and labels
data_one <- cbind(data.table(listing_id = tdata$listing_id, interest_level = tdata$interest_level), processed_data)
#merging the one-hot encoded features
data_one <- fdata[data_one, on = "listing_id"]
#split the data set back into train and test
train_one <- data_one[interest_level != "None"]
test_one <- data_one[interest_level == "None"]
test_one[,interest_level := NULL]
rm(data_one, processed_data)
#modeling data 1
#encode the target as integers; factor levels are alphabetical
train_one[,interest_level := as.integer(as.factor(interest_level)) - 1]
#high = 0, low = 1, medium = 2
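#optional check of the encoding: class counts under the integer codes above
table(train_one$interest_level) #0 = high, 1 = low, 2 = medium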
library(caTools) #sample.split
library(xgboost)
#logical vector marking a stratified 60% split of the training rows
sp <- sample.split(Y = train_one$interest_level, SplitRatio = 0.6)
#create data for xgboost
xg_val <- train_one[sp]
listing_id <- train_one$listing_id
target <- train_one$interest_level
xg_val_target <- target[sp]
#note: d_train keeps all rows, so the validation rows overlap the training
#rows and the watchlist score below is optimistic
d_train <- xgb.DMatrix(data = as.matrix(train_one[,-c("listing_id","interest_level"), with = F]), label = target)
d_val <- xgb.DMatrix(data = as.matrix(xg_val[,-c("listing_id","interest_level"), with = F]), label = xg_val_target)
d_test <- xgb.DMatrix(data = as.matrix(test_one[,-c("listing_id"), with = F]))
param <- list(booster = "gbtree",
              objective = "multi:softprob",
              eval_metric = "mlogloss",
              #nthread = 13,
              num_class = 3,
              eta = .02,
              gamma = 1,
              max_depth = 4,
              min_child_weight = 1,
              subsample = .7,
              colsample_bytree = .5
)
set.seed(2017)
watch <- list(val = d_val, train = d_train)
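#the commented-out nrounds line in xgb.train below refers to a cross-validation
#object; a minimal sketch of how such a run could look (5 folds, 1000 rounds and
#early stopping are assumptions, not taken from the original script):
#xgb2cv <- xgb.cv(params = param, data = d_train, nrounds = 1000, nfold = 5,
#                 early_stopping_rounds = 40, print_every_n = 10)
#xgb2cv$best_ntreelimit then gives the best number of boosting rounds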
xgb2 <- xgb.train(data = d_train,
                  params = param,
                  watchlist = watch,
                  #nrounds = xgb2cv$best_ntreelimit
                  nrounds = 500,
                  print_every_n = 10
) #validation - 0.6817
#reshape the flat prediction vector into one row per listing, one column per class
xg_pred <- as.data.table(t(matrix(predict(xgb2, d_test), nrow = 3, ncol = nrow(d_test))))
colnames(xg_pred) <- c("high","low","medium")
xg_pred <- cbind(data.table(listing_id = test$listing_id), xg_pred)
fwrite(xg_pred, "xgb_textmining.csv") #0.70473 on LB
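#optional: inspect which terms and features drive the model, using xgboost's
#built-in importance helper
imp <- xgb.importance(model = xgb2)
head(imp, 20)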
#Exercise 1
#TF-IDF data set
data_mining_tf <- as.data.table(as.matrix(weightTfIdf(new_docterm_corpus)))
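#the TF-IDF weights can replace the raw counts by rebuilding the data set the
#same way as Data 1 above; a sketch under that assumption (data_two, train_two
#and test_two are hypothetical names, and the same model parameters would be reused)
data_two <- cbind(data.table(listing_id = tdata$listing_id, interest_level = tdata$interest_level), data_mining_tf)
data_two <- fdata[data_two, on = "listing_id"]
train_two <- data_two[interest_level != "None"]
test_two <- data_two[interest_level == "None"]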
#Exercise 2
#Bigram model
#install.packages("RWeka") #run once if not already installed
library(RWeka)
#bigram tokenizer function
Bigram_Tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
#create a document-term matrix of bigrams
bi_docterm_matrix <- DocumentTermMatrix(text_corpus, control = list(tokenize = Bigram_Tokenizer))
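#a bigram matrix is far sparser than the unigram one, so a higher sparsity
#cut-off is usually needed before modeling; 0.99 is an illustrative assumption,
#not a tuned value, and bi_dtm_small is a hypothetical name
bi_dtm_small <- removeSparseTerms(bi_docterm_matrix, sparse = 0.99)
dim(bi_dtm_small)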