karmi/_readme.markdown

## _readme.markdown

      
    Raw
  

              _readme.markdown
            
          
    Simplistic Full-Text Search With Redis

Howto

git clone git://gist.github.com/923934.git redisearch

cd redisearch

./redisearch.rb index file1.txt file2.txt file3.txt

./redisearch.rb search ruby
./redisearch.rb search ruby programming
./redisearch.rb search ruby diamond

Resources


http://en.wikipedia.org/wiki/Index_(search_engine)
http://rosettacode.org/wiki/Inverted_Index
https://gist.github.com/120067
http://www.search-engines-book.com/
http://www.ir.uwaterloo.ca/book/


## file1.txt
The ruby is a pink to blood-red colored gemstone, a variety of the mineral corundum (aluminium oxide). The red color is caused mainly by the presence of the element chromium. Its name comes from ruber, Latin for red. Other varieties of gem-quality corundum are called sapphires. The ruby is considered one of the four precious stones, together with the sapphire, the emerald, and the diamond.

## file2.txt
Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features. Ruby originated in Japan during the mid-1990s and was first developed and designed by Yukihiro "Matz" Matsumoto. It was influenced primarily by Perl, Smalltalk, Eiffel, and Lisp.

## file3.txt
"Ruby" is a song by English rock band Kaiser Chiefs and is the lead track on their second album, Yours Truly, Angry Mob. It was released as the lead single from that album in the United Kingdom as a download on February 5, 2007 and as a limited edition 7 in and CD single on February 19 that year. It became the band's first ever #1 single on February 25, 2007, and ended 2007 as the year's 10th biggest-selling single in the UK with total sales of 313,765.

## redisearch.rb
#!/usr/bin/env ruby

require 'rubygems'
require 'redis'
require 'benchmark'

# Minimal full-text search on top of Redis.
#
# Keys written by this module:
#   search:index:terms:<token>                 set of document ids containing <token>
#   search:index:counts:<token>:<document_id>  occurrences of <token> in that document
#
# The module extends itself, so every method is callable as SimpleSearch.<name>.
module SimpleSearch

  # Reads the file at +input+, tokenizes its content and stores the result
  # in the index. The path the file was opened with serves as document id.
  def index(input)
    # Block form closes the file handle as soon as the content has been read.
    document_id, content = File.open(input) { |document| [document.path, document.read] }
    tokens = analyze(content)

    store document_id, tokens
    puts "Indexed document #{document_id} with tokens:", tokens.inspect, "\n"
  end

  # Turns +content+ into an array of tokens: splits on non-word characters,
  # downcases every word and drops stop words and empty strings.
  # Repeated words are kept, so the result can be used for counting.
  def analyze(content)
    content.split(/\W/).
      map(&:downcase).
      reject { |word| word.empty? || STOPWORDS.include?(word) }
  end

  # Writes one posting and one occurrence increment per token.
  def store(document_id, tokens)
    tokens.each do |token|
      # >>> Save posting into inverted index
      R.sadd "search:index:terms:#{token}", document_id
      # >>> Save occurrence count per token and document
      R.incr "search:index:counts:#{token}:#{document_id}"
    end
  end

  # Prints every document that contains all terms of +query+, together with
  # its score and the time the lookup took.
  def search(query)
    puts '-'*80
    elapsed = Benchmark.realtime do
      # >>> Run the query through the same analyzer as the documents, so that
      #     case and punctuation do not prevent a match against the index
      terms = analyze(query)
      # >>> Gather document ids containing all terms; without any term there
      #     is nothing to intersect, so skip the Redis call entirely
      documents = terms.empty? ? [] : R.sinter(*terms.map { |term| "search:index:terms:#{term}" })

      documents.each do |document|
        # >>> Compute score as: <OCCURRENCES OF ALL TERMS IN DOCUMENT> / <TERMS COUNT>
        #     (integer division, the score is truncated)
        total = terms.inject(0) do |sum, term|
          sum + R.get("search:index:counts:#{term}:#{document}").to_i
        end
        score = total / terms.size
        puts "* #{document} (Score: #{score})"
      end
    end
    puts '-'*80, "Query '#{query}' finished in #{sprintf("%1.5f", elapsed)} seconds"
  end

  # Returns up to ten [term, count] pairs, where count is the number of
  # occurrences of the term summed over all documents, highest count first.
  def counts
    occurrences = Hash.new(0)
    # >>> Load counts for all terms and documents in index
    R.keys("search:index:counts:*").each do |key|
      # Key layout is search:index:counts:<term>:<document_id>; the term is the 4th field
      term = key.split(':')[3]
      occurrences[term] += R.get(key).to_i
    end
    # >>> Sort the terms by their occurrences, in descending order
    occurrences.sort { |a, b| b[1] <=> a[1] }.first(10)
  end

  # Redis connection shared by all methods (created with the client's defaults).
  R = Redis.new

  # Words that are never indexed.
  STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|.freeze

  extend self
end


# Command-line entry point, executed only when the file is run directly.
#
#   index  <FILE> [<FILE> ...]  drop the current index and index the given files
#   search <QUERY>              print documents matching all words of the query
#   counts                      print the most frequent terms in the index
if __FILE__ == $0
  usage = "USAGE:\n  #{$0} index <FILE> [<FILE> ...]\n  #{$0} search <QUERY>\n  #{$0} counts"

  case ARGV.shift

    when 'search'
      SimpleSearch.search ARGV.join(' ')

    when 'counts'
      SimpleSearch.counts.each do |term, count|
        puts "* #{term.ljust(10)} (#{count})"
      end

    when 'index'
      if ARGV.empty?
        # Without files there is nothing to index; do not wipe the existing index.
        puts usage
      else
        elapsed = Benchmark.realtime do
          # Indexing always starts from scratch: remove every key under "search:"
          SimpleSearch::R.keys("search:*").each { |key| SimpleSearch::R.del key }
          ARGV.each { |file| SimpleSearch.index file }
        end
        puts '-'*80, "Indexing done in #{sprintf("%1.2f", elapsed)} seconds", '-'*80
      end

    else
      puts usage
  end
end
	#!/usr/bin/env ruby

	require 'rubygems'
	require 'redis'
	require 'benchmark'

	module SimpleSearch

	def index input
	document = File.new(input)
	document_id = document.path
	tokens = analyze document.read

	store document_id, tokens
	puts "Indexed document #{document_id} with tokens:", tokens.inspect, "\n"
	end

	def analyze content
	# >>> Split content by words
	content.split(/\W/).
	# >>> Downcase every word
	map { \|word\| word.downcase }.
	# >>> Reject stop words and empty tokens
	reject { \|word\| STOPWORDS.include?(word) \|\| word == '' }
	end

	def store document_id, tokens
	tokens.each do \|token\|
	# >>> Save posting into inverted index
	R.sadd "search:index:terms:#{token}", document_id
	# >>> Save occurence count per token and document
	R.incr "search:index:counts:#{token}:#{document_id}"
	end
	end

	def search query
	puts '-'*80
	elapsed = Benchmark.realtime do
	# >>> Just split query into terms
	terms = query.split(' ')
	# >>> Gather document ids containing all terms
	documents = R.sinter *terms.map { \|term\| "search:index:terms:#{term}" }

	documents.each do \|document\|
	# >>> Compute score as: <OCCURENCES OF ALL TERMS IN DOCUMENT> / <TERMS COUNT>
	score = terms.inject(0) do \|sum, term\|
	sum += R.get("search:index:counts:#{term}:#{document}").to_i; sum
	end / terms.size
	puts "* #{document} (Score: #{score})"
	end
	end
	puts '-'*80, "Query '#{query}' finished in #{sprintf("%1.5f", elapsed)} seconds"
	end

	def counts
	occurences = {}
	# >>> Load counts for all terms and documents in index
	counts = R.keys "search:index:counts:*"
	counts.each do \|count\|
	_, _, _, term, file = count.split(':')
	# p [term, file]
	# p R.get count
	occurences[term] \|\|= 0
	# >>> Build a hash with terms and their counts
	occurences.update( { term => (occurences[term] += R.get(count).to_i) } )
	end
	# >>> Sort the terms by their occurences, in descending order
	occurences = occurences.sort { \|a, b\| b[1] <=> a[1] }
	occurences[0...10]
	end

	R = Redis.new

	STOPWORDS = %w\|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with\|

	extend self
	end


	if __FILE__ == $0
	case command = ARGV.shift

	when 'search'
	query = ARGV.join(' ')
	SimpleSearch.search query

	when 'counts'
	SimpleSearch.counts.each do \|facet\|
	term, count = facet
	puts "* #{term.ljust(10)} (#{count})"
	end

	when 'index'
	elapsed = Benchmark.realtime do
	SimpleSearch::R.keys("search:*").each { \|key\| SimpleSearch::R.del key }
	ARGV.each { \|file\| SimpleSearch.index file }
	end
	puts '-'*80, "Indexing done in #{sprintf("%1.2f", elapsed)} seconds", '-'*80

	else
	puts "USAGE:\n #{$0} index <FILE>\n #{$0} search <QUERY>\n #{$0} counts"
	end
	end