Skip to content

Instantly share code, notes, and snippets.

@brendte
Created February 5, 2013 16:15
Show Gist options
  • Save brendte/4715478 to your computer and use it in GitHub Desktop.
Save brendte/4715478 to your computer and use it in GitHub Desktop.
Complete index and query solution. Naively implemented with nested loops.
#!/usr/bin/ruby
require 'rubygems'
require 'fast_stemmer'
# Normalizes raw document strings into lists of stemmed terms.
#
# For each document: punctuation is removed, the text is lower-cased and split
# on whitespace, words found in the fixed stop-word list are dropped, and the
# remaining words are passed through String#stem.
#
# docs - the document strings, in order.
#
# Returns a Hash mapping a 1-based document id (the document's position in
# +docs+) to that document's Array of stemmed words.
def doc_prep(docs)
  stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,use,used".split(',')

  docs.each.with_index(1).each_with_object({}) do |(text, id), prepared|
    tokens = text.gsub(/[[:punct:]]/, '').downcase.split
    kept = tokens.reject { |token| stop_words.include?(token) }
    prepared[id] = kept.map { |token| token.stem }
  end
end
# Builds an inverted index (dictionary plus postings lists) from prepared
# documents.
#
# docs - Hash mapping a document id to that document's Array of terms
#        (the shape returned by doc_prep).
#
# Returns a Hash keyed by the term as a Symbol. Each value is a Hash with:
#   :term_id  - sequential id, starting at 1, in order of first appearance
#   :cf       - collection frequency: total occurrences across all documents
#   :df       - document frequency: number of documents containing the term
#   :postings - Array of { doc_id:, tf: } hashes, one per containing document,
#               where :tf is the number of occurrences in that document
def create_dictionary_and_postings(docs)
  dictionary = {}
  term_id = 0

  docs.each do |doc_id, doc|
    # Term => this document's posting, so repeat occurrences are a hash lookup
    # instead of a scan over every posting recorded for the term.
    postings_in_this_doc = {}

    doc.each do |word|
      term = word.to_sym
      entry = dictionary[term]
      if entry.nil?
        term_id += 1
        entry = dictionary[term] = { term_id: term_id, cf: 0, df: 0, postings: [] }
      end
      entry[:cf] += 1

      posting = postings_in_this_doc[term]
      if posting
        posting[:tf] += 1
      else
        # First occurrence of the term in this document.
        posting = { doc_id: doc_id, tf: 1 }
        postings_in_this_doc[term] = posting
        entry[:df] += 1
        entry[:postings] << posting
      end
    end
  end

  dictionary
end
# Command-line driver: every argument is treated as the text of one document.
# The documents are prepared and indexed, then four fixed boolean queries are
# evaluated against the index and their sorted hit lists are printed.
docs = doc_prep(ARGV)
index = create_dictionary_and_postings(docs)

# Returns the ids of the documents whose postings contain the stem of +word+.
# A term that occurs in no document yields [] instead of raising on nil, so a
# query over a corpus lacking one of its terms still prints a hit list.
doc_ids_for = lambda do |word|
  entry = index.fetch(word.stem.to_sym, {})
  entry.fetch(:postings, []).map { |posting| posting[:doc_id] }
end

# AND is array intersection (&) and OR is array union (|); both return
# duplicate-free results, so only the final sort is needed.

###################
### index AND query
###################
puts "index AND query"
document_hits = (doc_ids_for.call("index") & doc_ids_for.call("query")).sort
puts "hit list: #{document_hits.inspect}"

###################
### (search AND query) OR (search AND retrieve)
###################
puts "(search AND query) OR (search AND retrieve)"
search_ids = doc_ids_for.call("search")
document_hits_1 = search_ids & doc_ids_for.call("query")
document_hits_2 = search_ids & doc_ids_for.call("retrieve")
document_hits = (document_hits_1 | document_hits_2).sort
puts "hit list: #{document_hits.inspect}"

###################
### (search AND engine AND web) OR feedback
###################
puts "(search AND engine AND web) OR feedback"
document_hits_1 = doc_ids_for.call("search") & doc_ids_for.call("engine") & doc_ids_for.call("web")
document_hits_2 = doc_ids_for.call("feedback")
document_hits = (document_hits_1 | document_hits_2).sort
puts "hit list: #{document_hits.inspect}"

###################
### (index OR cluster) AND (web OR system)
###################
puts "(index OR cluster) AND (web OR system)"
document_hits_l = doc_ids_for.call("index") | doc_ids_for.call("cluster")
document_hits_r = doc_ids_for.call("web") | doc_ids_for.call("system")
document_hits = (document_hits_l & document_hits_r).sort
puts "hit list: #{document_hits.inspect}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment