# Document search with TF-IDF weighting and cosine similarity.
#
# Reads every *.txt file in the working directory as a news document, reads
# the search queries (one per line) from query.txt, builds a single TF-IDF
# term-document matrix over documents and queries together, and scores every
# document against every query with the dot product of unit-length columns.

# Load required packages ----
required_pkgs <- c("readtext", "tm", "stringr", "qdap", "slam")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
library(readtext)
library(tm)
library(stringr)
library(qdap)
library(slam)

# Data files are uploaded at the location below:
# https://github.com/sureshgorakala/machinelearning/tree/master/data

# Load all content files ----
query_file <- "query.txt"
news_docs <- readtext("*.txt")
# The "*.txt" glob also matches the query file, which lives in the same
# directory; it must not be indexed as a news document.
news_docs <- news_docs[news_docs$doc_id != query_file, , drop = FALSE]
# Strip everything between " [" and "]" from each document.
news_list <- lapply(news_docs$text, function(x) genX(x, " [", "]"))
N.docs <- length(news_list)
names(news_list) <- news_docs$doc_id
if (N.docs == 0) {
  stop("No news documents (*.txt) found in ", getwd(), call. = FALSE)
}

# Load search queries ----
search_queries <- readtext(query_file)
# Split on LF or CRLF, trim stray whitespace and drop blank lines so that
# every query is non-empty and can be matched exactly later on.
queries_list <- trimws(unlist(strsplit(search_queries$text[1], "\r?\n")))
queries_list <- queries_list[nzchar(queries_list)]
N.query <- length(queries_list)
if (N.query == 0) {
  stop("No search queries found in ", query_file, call. = FALSE)
}
names(queries_list) <- paste0("query", seq_len(N.query))

# Preprocess news content and queries ----
# Documents and queries go into one corpus so they share a vocabulary and the
# same TF-IDF weights.
doc_names <- c(names(news_list), names(queries_list))
corpus_text <- c(
  unlist(news_list, use.names = FALSE),
  unname(queries_list)
)
newscorpus <- VectorSource(corpus_text)
newscorpus$Names <- doc_names

# Convert to corpus format.
newscorpus_preproc <- Corpus(newscorpus)

# Cleaning the data. Stop words are removed before punctuation is stripped so
# that contractions in the stop-word list (e.g. "don't") still match.
newscorpus_preproc <- tm_map(newscorpus_preproc, content_transformer(tolower))
newscorpus_preproc <- tm_map(
  newscorpus_preproc, removeWords, stopwords("english")
)
newscorpus_preproc <- tm_map(newscorpus_preproc, removePunctuation)
newscorpus_preproc <- tm_map(newscorpus_preproc, stripWhitespace)

# Create the term-document matrix with (unnormalized) TF-IDF weights ----
tdm <- TermDocumentMatrix(
  newscorpus_preproc,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)
tdm_mat <- as.matrix(tdm)
colnames(tdm_mat) <- doc_names

# Normalize every column to unit Euclidean length ----
col_norms <- sqrt(colSums(tdm_mat^2))
# A column with no remaining terms has norm 0; dividing by it would produce
# NaN, so leave such all-zero columns untouched instead.
col_norms[col_norms == 0] <- 1
tfidf_mat <- scale(tdm_mat, center = FALSE, scale = col_norms)

# Separate the query columns from the document columns ----
# drop = FALSE keeps matrices (and their column names) even when there is a
# single query or a single document.
query.vectors <- tfidf_mat[, N.docs + seq_len(N.query), drop = FALSE]
tfidf_mat <- tfidf_mat[, seq_len(N.docs), drop = FALSE]

# Calculate the similarity scores ----
# crossprod(a, b) is t(a) %*% b: one row per query, one column per document.
doc.scores <- crossprod(query.vectors, tfidf_mat)
# check.names = FALSE keeps the document names exactly as they are.
results.df <- data.frame(
  querylist = queries_list,
  doc.scores,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

#' Show the best-matching documents for a query
#'
#' Looks the query up in the global `results.df` and returns the documents
#' with a positive similarity score, best match first.
#'
#' @param query A single query string; must be one of `results.df$querylist`.
#' @param top_n Maximum number of documents to return.
#' @return A data frame with columns `score` and `docs`, holding at most
#'   `top_n` rows (fewer when fewer documents have a positive score).
showTopresults <- function(query, top_n = 3) {
  stopifnot(is.character(query), length(query) == 1L)
  hit <- match(query, results.df$querylist)
  if (is.na(hit)) {
    stop("Unknown query: \"", query, "\"", call. = FALSE)
  }

  # Everything except the first column (querylist) is a per-document score.
  scores <- unlist(results.df[hit, -1, drop = FALSE])
  ranked <- data.frame(
    score = as.numeric(scores),
    docs = names(scores),
    stringsAsFactors = FALSE
  )
  ranked <- ranked[!is.na(ranked$score) & ranked$score > 0, , drop = FALSE]
  ranked <- ranked[order(ranked$score, decreasing = TRUE), , drop = FALSE]
  rownames(ranked) <- NULL
  head(ranked, top_n)
}

# Test the function ----
print(showTopresults("narendra modi visit to washington"))