@sureshgorakala
Created December 11, 2017 07:25
Search engine using SVD in R
#load required packages
if(!require("readtext")) install.packages("readtext")
library(readtext)
if(!require("tm")) install.packages("tm")
library(tm)
if(!require("stringr")) install.packages("stringr")
library(stringr)
if(!require("qdap")) install.packages("qdap")
library(qdap)
if(!require("slam")) install.packages("slam")
library(slam)
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1\\data")
#data files are uploaded at below location:
#https://github.com/sureshgorakala/machinelearning/tree/master/data
#load all content files
news_docs = readtext("*.txt")
#genX() from qdap strips any text enclosed between " [" and "]" markers
news_list = lapply(news_docs[,2],function(x) genX(x, " [", "]"))
N.docs = length(news_list)
names(news_list) = news_docs[,1]
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1")
#load search queries
search_queries = readtext("query.txt",dvsep = "\n")
queries_list = unlist(strsplit(search_queries[1,2],"\n"))
N.query = length(queries_list)
names(queries_list) = paste0("query", c(1:N.query))
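#Quick check (illustrative): how many documents and queries were loaded, and
#under which names they will appear in the term-document matrix built below.
cat("documents:", N.docs, " queries:", N.query, "\n")
print(names(queries_list))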
#preprocess the news content
#combine the documents and the search queries into one list, then wrap it in a VectorSource
newscorpus = VectorSource(c(news_list,queries_list))
newscorpus$Names = c(names(news_list),names(queries_list))
#convert to corpus format
newscorpus_preproc = Corpus(newscorpus)
#cleaning the data
newscorpus_preproc = tm_map(newscorpus_preproc,stripWhitespace)
newscorpus_preproc = tm_map(newscorpus_preproc,removePunctuation)
newscorpus_preproc = tm_map(newscorpus_preproc,content_transformer(tolower))
newscorpus_preproc = tm_map(newscorpus_preproc,removeWords,stopwords("english"))
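#Spot-check the cleaning (illustrative): print the first 200 characters of the
#first processed document to confirm the transformations applied as expected.
writeLines(substr(as.character(newscorpus_preproc[[1]]), 1, 200))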
#create a term-document matrix with unnormalized TF-IDF weighting
tdm = TermDocumentMatrix(newscorpus_preproc,control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
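#tm's weightTfIdf computes tf * log2(nDocs/df) per term; with normalize = FALSE
#the raw term counts are used as tf.
#Quick inspection (illustrative):
dim(tdm) #terms x (N.docs + N.query)
inspect(tdm[1:5, 1:5]) #peek at a corner of the weighted matrix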
tdm_mat = as.matrix(tdm)
colnames(tdm_mat) = c(names(news_list),names(queries_list))
#normalize each column (document/query vector) to unit Euclidean length
tfidf_mat <- scale(tdm_mat, center = FALSE,scale = sqrt(colSums(tdm_mat^2)))
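#After this scaling every column of tfidf_mat has unit Euclidean length, so a
#plain dot product between two columns is exactly their cosine similarity.
#Sanity check (illustrative; assumes no all-zero columns, which would give NaN):
stopifnot(all(abs(colSums(tfidf_mat^2) - 1) < 1e-6))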
#split the normalized matrix into document vectors and query vectors
trainsvd = tfidf_mat[, 1:N.docs]
#trainsvd2 = t(tdm_mat)[1:9,]
querytfidf = tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
#truncated SVD, keeping the top 5 latent dimensions (irlba's default nv)
if(!require("irlba")) install.packages("irlba")
library(irlba)
decom2 = irlba(trainsvd, nv = 5, maxit = 100)
D2 = diag(decom2$d)
U2 = decom2$u
V2 = decom2$v
newTerms2 = U2 %*% D2 #term coordinates in latent space (1233 x 5 here)
newA2 = t(V2) #document coordinates in latent space (5 x 9)
newQ2 = t(newTerms2) %*% querytfidf #queries mapped into latent space (5 x 9)
cosinesim2 = t(newQ2) %*% newA2 #query-document similarity scores (9 x 9)
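#Because newTerms2 %*% newA2 reconstructs the rank-5 approximation of trainsvd,
#cosinesim2 equals t(querytfidf) %*% U2 %*% D2 %*% t(V2); since all columns were
#scaled to unit length above, these scores approximate cosine similarities.
#Minimal ranking sketch (illustrative; top_n is an arbitrary choice, and the
#row/column labels assume the ordering used when building the corpus):
rownames(cosinesim2) = names(queries_list)
colnames(cosinesim2) = names(news_list)
top_n = 3
best_matches = apply(cosinesim2, 1, function(s) names(sort(s, decreasing = TRUE))[1:top_n])
print(best_matches) #one column per query, most similar document first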
#alternative: full SVD without truncation (disabled draft, kept for reference)
#decom = svd(trainsvd)
#newTerms = decom$u %*% diag(decom$d)
#newA = t(decom$v)
#newQ = t(newTerms) %*% querytfidf
#cosinesim = t(newQ) %*% newA #query-document similarity (queries x docs)
#querytfidf2 = t(tdm_mat)[10:18,]
#applying svd to the transposed tdm (disabled experiment)
#nu = nrow(trainsvd)
#nv = ncol(trainsvd)
#decom = svd(trainsvd2)
#cosinesim2 = querytfidf2 %*% decom$v
#earlier draft using irlba, lsa::cosine and corrplot (disabled, kept for reference)
#if(!require("irlba")) install.packages("irlba")
#library(irlba)
#decom2 = irlba(trainsvd, maxit = 100)
#cosinesim2 = querytfidf %*% decom2$v
#svddata = data.frame(decom$v)
#names(svddata) = rownames(tfidf_mat)
#if(!require("lsa")) install.packages("lsa")
#library(lsa)
#cosineSim = cosine(t(svddata))
#View(cosineSim)
#D = diag(decom$d)
#S = diag(decom$d^0.5)
#calculating document similarity:
#DocSim = S %*% t(decom$v)
#library(corrplot)
#o = corrplot(DocSim, method = "number") #visualize document-to-document similarity
#subset only the queries and their document similarity values
#x = N.docs + 1
#y = N.docs + N.query
#searchSimilarities = DocSim[x:y, 1:(x-1)]
#o = corrplot(searchSimilarities, method = "number") #visualize query-to-document similarity
#separating the query vectors from the document vectors
#query.vectors <- tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
#tfidf_mat <- tfidf_mat[, 1:N.docs]