Importing and searching RSS with ElasticSearch and Tire

elasticoverflow.rb
# =======================================================
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================
#
# This script downloads, parses, and indexes the Stack Overflow feed with ElasticSearch
# via the [Tire](https://github.com/karmi/tire) Rubygem.
#
# Requirements
# ------------
#
# * Sun Java 6 (for ElasticSearch)
# * Ruby >= 1.8.7
# * Rubygems >= 1.5.0
#
# Usage
# -----
#
#     $ ruby elasticoverflow.rb
#
 
require 'rubygems'
require 'open-uri'
require 'benchmark'
 
# Check for required Rubygems, exit otherwise
#
%w[ tire nokogiri ].each do |lib|
  begin
    require lib
  rescue LoadError
    STDERR.puts "[ERROR] Required library '#{lib}' missing.",
                "        Please install it with:",
                "        $ gem install #{lib}", "\n"
    exit(1)
  end
end
 
# Check if ElasticSearch is running on this machine, exit otherwise
#
# (The check uses RestClient, which ships with the rest-client gem,
# a dependency of Tire; we require it explicitly since we call it directly.)
require 'rest_client'

( puts <<-"INSTALL" ; exit(1) ) unless (RestClient.get('http://localhost:9200') rescue false)

[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

  curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
  tar -zxvf elasticsearch-0.16.0.tar.gz
  ./elasticsearch-0.16.0/bin/elasticsearch -f
INSTALL
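# You can also verify the server by hand: ElasticSearch answers on its root
# URL with a JSON status document, something like (version will vary):
#
#     $ curl http://localhost:9200
#     {"ok":true,"status":200, ... ,"version":{"number":"0.16.0", ...}}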
 
URL = 'http://stackoverflow.com/feeds'
 
puts "", "Fetching data from '#{URL}'...", "-"*80
 
# Parse the Stack Overflow Atom feed. Nokogiri's HTML parser is used here,
# since it ignores XML namespaces and keeps the XPath queries below simple.
#
feed = Nokogiri::HTML(open(URL))
 
# Prepare the documents
#
documents = feed.search("//entry").map do |entry|
  result              = {}
  result[:type]       = 'question'
  result[:id]         = entry.xpath("id").text[/questions\/(\d+)\//, 1]
  result[:title]      = entry.xpath("title").text
  result[:link]       = entry.xpath("link[@rel='alternate']/@href").text
  result[:categories] = entry.xpath("category/@term").map { |c| c.to_s }
  result[:author]     = entry.xpath("author/name").text
  result[:published]  = entry.xpath("published").text
  result[:summary]    = entry.xpath("summary").text

  result
end
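# Each prepared document is a plain Ruby Hash. With made-up, illustrative
# values, one looks roughly like this:
#
#     { :type       => 'question',
#       :id         => '1234567',
#       :title      => 'How do I parse an Atom feed in Ruby?',
#       :link       => 'http://stackoverflow.com/questions/1234567/...',
#       :categories => ['ruby', 'atom', 'nokogiri'],
#       :author     => 'Jane Doe',
#       :published  => '2011-05-01T12:00:00Z',
#       :summary    => '<p>...</p>' }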
 
puts "", "Importing these #{documents.size} documents:", "-"*80
 
documents.each { |document| puts "* #{document[:title]}" }
 
elapsed = Benchmark.realtime do

  Tire.index 'stackoverflow' do

    # Create the index with a proper mapping (unless it already exists)
    #
    create :mappings => {
      :question => {
        :properties => {
          :id         => { :type => 'string', :analyzer => 'keyword'  },
          :link       => { :type => 'string', :analyzer => 'keyword'  },
          :categories => { :type => 'string', :analyzer => 'keyword'  },
          :author     => { :type => 'string', :analyzer => 'keyword'  },
          :title      => { :type => 'string', :analyzer => 'snowball' },
          :summary    => { :type => 'string', :analyzer => 'snowball' }
        }
      }
    }
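    # ('keyword' indexes each value as one unanalyzed token, suitable for
    # exact matches and facet counts; 'snowball' stems English words for
    # full-text search.)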
 
    # Import documents
    import documents

    # Refresh the index for immediate searching
    #
    refresh
  end
 
end
 
puts "-"*80, "Importing took #{(elapsed*1000).to_i} milliseconds"
 
puts "", "Searching...", "-"*80
 
s = Tire.search('stackoverflow') do

  # Search for questions containing 'ruby'
  #
  query { string 'ruby' }

  # Retrieve aggregated counts for the top ten categories, computed globally
  # (over the whole index, not just the documents matching the query)
  #
  facet('categories') { terms :categories, :global => true }
end
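# Tire translates the block above into the ElasticSearch Query DSL; the
# request body it sends should look roughly like this (a sketch, not the
# exact serialization -- see `s.to_curl` below for the real thing):
#
#     { "query"  : { "query_string" : { "query" : "ruby" } },
#       "facets" : { "categories"   : { "terms"  : { "field" : "categories" },
#                                       "global" : true } } }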
 
puts "Search took #{s.results.time} milliseconds."
 
puts "", "Any questions about ruby?", "-"*80
 
s.results.each do |d|
  puts "#{d.author} : #{d.title} [#{d.categories.join(', ')}]"
end
 
puts "", "Top 10 categories in database:", "-"*80
 
s.results.facets['categories']['terms'].each do |f|
  puts "#{f['term'].ljust(15)} #{f['count']}"
end
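# Each facet entry is a Hash with 'term' and 'count' keys, so the listing
# comes out as two aligned columns, e.g. (counts are illustrative):
#
#     ruby            12
#     rails            7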
 
puts "", "Or, try the search with curl:", "-"*80
puts s.to_curl
