git clone git://gist.github.com/923934.git redisearch
cd redisearch
./redisearch.rb index file1.txt file2.txt file3.txt
./redisearch.rb search ruby
./redisearch.rb search ruby programming
./redisearch.rb search ruby diamond
Created
April 17, 2011 11:06
-
-
Save karmi/923944 to your computer and use it in GitHub Desktop.
Simplistic Full-Text Search With Redis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The ruby is a pink to blood-red colored gemstone, a variety of the mineral corundum (aluminium oxide). The red color is caused mainly by the presence of the element chromium. Its name comes from ruber, Latin for red. Other varieties of gem-quality corundum are called sapphires. The ruby is considered one of the four precious stones, together with the sapphire, the emerald, and the diamond.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features. Ruby originated in Japan during the mid-1990s and was first developed and designed by Yukihiro "Matz" Matsumoto. It was influenced primarily by Perl, Smalltalk, Eiffel, and Lisp.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"Ruby" is a song by English rock band Kaiser Chiefs and is the lead track on their second album, Yours Truly, Angry Mob. It was released as the lead single from that album in the United Kingdom as a download on February 5, 2007 and as a limited edition 7 in and CD single on February 19 that year. It became the band's first ever #1 single on February 25, 2007, and ended 2007 as the year's 10th biggest-selling single in the UK with total sales of 313,765. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'redis' | |
require 'benchmark' | |
# Simplistic full-text search backed by Redis.
#
# Redis keys written by this module:
#   search:index:terms:<token>                -- set of document ids containing <token>
#   search:index:counts:<token>:<document_id> -- occurrences of <token> in that document
#
# Every method is also callable directly on the module (see `extend self`).
module SimpleSearch

  # Shared Redis connection; also used directly by the command-line runner.
  R = Redis.new

  # Words that are dropped by `analyze` and therefore never indexed or searched.
  STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|.freeze

  # Reads the file at +input+, tokenizes its content and stores the tokens
  # in the index. The path used to open the file serves as the document id.
  # Prints the document id and its tokens.
  def index input
    # Block form closes the file handle even when reading raises.
    document_id, content = File.open(input) { |document| [document.path, document.read] }

    tokens = analyze content
    store document_id, tokens

    puts "Indexed document #{document_id} with tokens:", tokens.inspect, "\n"
  end

  # Turns +content+ into an array of tokens: runs of word characters (\w),
  # downcased, with stop words removed. Duplicates are kept, in order.
  def analyze content
    content.scan(/\w+/).
      map { |word| word.downcase }.
      reject { |word| STOPWORDS.include?(word) }
  end

  # Saves +tokens+ of the document +document_id+ into the index.
  def store document_id, tokens
    # Group equal tokens first so each distinct token costs two Redis calls
    # instead of two calls per occurrence.
    occurrences = tokens.each_with_object(Hash.new(0)) { |token, tally| tally[token] += 1 }

    occurrences.each do |token, count|
      # >>> Save posting into inverted index
      R.sadd "search:index:terms:#{token}", document_id
      # >>> Save occurrence count per token and document
      R.incrby "search:index:counts:#{token}:#{document_id}", count
    end
  end

  # Prints every document that contains all terms of +query+, each with its
  # score, followed by the elapsed time.
  def search query
    puts '-'*80

    elapsed = Benchmark.realtime do
      # Analyze the query exactly like indexed content, otherwise terms such as
      # "Ruby" could never match the downcased tokens stored in the index.
      terms = analyze(query)

      # >>> Gather document ids containing all terms
      # SINTER needs at least one key, so a query without usable terms matches nothing.
      documents = terms.empty? ? [] : R.sinter(*terms.map { |term| "search:index:terms:#{term}" })

      documents.each do |document|
        # >>> Compute score as: <OCCURRENCES OF ALL TERMS IN DOCUMENT> / <TERMS COUNT>
        # (integer division, as both operands are integers)
        total = terms.inject(0) do |sum, term|
          sum + R.get("search:index:counts:#{term}:#{document}").to_i
        end
        score = total / terms.size

        puts "* #{document} (Score: #{score})"
      end
    end

    puts '-'*80, "Query '#{query}' finished in #{sprintf("%1.5f", elapsed)} seconds"
  end

  # Returns up to ten [term, occurrences] pairs, summed over all documents
  # and sorted by occurrences in descending order.
  def counts
    totals = Hash.new(0)

    # >>> Load counts for all terms and documents in index
    R.keys("search:index:counts:*").each do |key|
      # Key layout is search:index:counts:<term>:<document_id>; only the term is needed.
      term = key.split(':')[3]
      totals[term] += R.get(key).to_i
    end

    # >>> Sort the terms by their occurrences, in descending order
    totals.sort { |(_, a), (_, b)| b <=> a }.first(10)
  end

  extend self

end
# Command-line entry point; runs only when this file is executed directly.
#
#   index <FILE>...   wipes all "search:*" keys and indexes the given files
#   search <QUERY>    prints documents matching the query
#   counts            prints the terms returned by SimpleSearch.counts
if __FILE__ == $0
  separator = '-'*80
  command   = ARGV.shift

  case command
  when 'index'
    redis = SimpleSearch::R

    elapsed = Benchmark.realtime do
      # Start from a clean index: remove every key in the "search:" namespace.
      redis.keys("search:*").each { |key| redis.del key }
      ARGV.each { |file| SimpleSearch.index file }
    end

    duration = sprintf("%1.2f", elapsed)
    puts separator, "Indexing done in #{duration} seconds", separator
  when 'search'
    # Remaining arguments together form the query.
    SimpleSearch.search ARGV.join(' ')
  when 'counts'
    SimpleSearch.counts.each do |term, count|
      puts "* #{term.ljust(10)} (#{count})"
    end
  else
    puts "USAGE:\n #{$0} index <FILE>\n #{$0} search <QUERY>\n #{$0} counts"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment