# karmi/import-rss-feed-into-elasticsearch.rb

## import-rss-feed-into-elasticsearch.rb
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================

require 'rubygems'
require 'tire'
require 'nokogiri'
require 'open-uri'

# Before doing anything else, probe http://localhost:9200 for an ElasticSearch
# server. Any StandardError raised by the probe counts as "not reachable"; in
# that case the installation instructions are printed and the script exits
# with status 1.
#
elasticsearch_up = begin
  RestClient.get('http://localhost:9200')
rescue StandardError
  false
end

unless elasticsearch_up
  puts <<-"INSTALL"

 [ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

         curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
         tar -zxvf elasticsearch-0.16.0.tar.gz
         ./elasticsearch-0.16.0/bin/elasticsearch -f
INSTALL
  exit(1)
end

# Location of the Stack Overflow feed to import.
URL = 'http://stackoverflow.com/feeds'

puts "", "Fetching data from #{URL}...", "-"*80

# Download through URI#open (mixed in by open-uri) rather than Kernel#open,
# which stopped accepting URLs in Ruby 3.0. The block form closes the
# downloaded stream as soon as Nokogiri has parsed it.
#
# NOTE(review): the feed is parsed with the HTML parser; presumably this keeps
# the un-prefixed XPath queries below free of namespace handling -- confirm
# before switching to Nokogiri::XML.
feed = URI.parse(URL).open { |io| Nokogiri::HTML(io) }

# Convert every <entry> node into a Hash of the fields to index.
# :id holds the digits captured from the entry's <id> text (nil when the
# pattern does not match); :categories is an Array with one String per
# category/@term attribute.
documents = feed.search("//entry").map do |entry|
  {
    :type       => 'question',
    :id         => entry.xpath("id").text[/questions\/(\d+)\//, 1],
    :title      => entry.xpath("title").text,
    :link       => entry.xpath("link[@rel='alternate']/@href").text,
    :categories => entry.xpath("category/@term").map(&:to_s),
    :author     => entry.xpath("author/name").text,
    :published  => entry.xpath("published").text,
    :summary    => entry.xpath("summary").text
  }
end

# Report how many documents were parsed and list their titles.
puts "", "Importing these #{documents.size} documents...", "-"*80

documents.each do |doc|
  puts "* #{doc[:title]}"
end

# Field mapping for the `question` type: the first group of fields uses the
# 'keyword' analyzer, the second the 'snowball' analyzer. Each field gets its
# own Hash, inserted in the order listed here.
question_properties = {}
[:id, :link, :categories, :author].each do |field|
  question_properties[field] = { :type => 'string', :analyzer => 'keyword' }
end
[:title, :summary].each do |field|
  question_properties[field] = { :type => 'string', :analyzer => 'snowball' }
end

# Create the 'stackoverflow' index with that mapping, import the parsed
# documents and refresh the index.
Tire.index 'stackoverflow' do
  create :mappings => { :question => { :properties => question_properties } }

  import documents

  refresh
end

# Query the 'stackoverflow' index for the string "ruby" and request a terms
# facet named 'categories' over the :categories field (:global => true).
search = Tire.search('stackoverflow') do
  query do
    string 'ruby'
  end

  facet('categories') do
    terms :categories, :global => true
  end
end

# Print one line per hit: author, title and the comma-separated categories.
puts "", "Any questions about ruby?", "-"*80

search.results.each do |question|
  puts "#{ question.author } : #{question.title} [#{question.categories.join(', ')}]"
end

# Print each term of the 'categories' facet, padded to 15 columns, with its count.
puts "", "Top 10 categories in database:", "-"*80

search.results.facets['categories']['terms'].each do |bucket|
  puts "#{bucket['term'].ljust(15)} #{bucket['count']}"
end


# Finally, show the equivalent curl command for the search.
puts "", "Or, try the search with curl:", "-"*80
puts search.to_curl
	# Importing and searching RSS with ElasticSearch and Tire
	# =======================================================

	require 'rubygems'
	require 'tire'
	require 'nokogiri'
	require 'open-uri'

	# Before doing anything else, probe http://localhost:9200 for an ElasticSearch
	# server. Any StandardError raised by the probe counts as "not reachable"; in
	# that case the installation instructions are printed and the script exits
	# with status 1.
	#
	elasticsearch_up = begin
	  RestClient.get('http://localhost:9200')
	rescue StandardError
	  false
	end

	unless elasticsearch_up
	  puts <<-"INSTALL"

	[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

	curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
	tar -zxvf elasticsearch-0.16.0.tar.gz
	./elasticsearch-0.16.0/bin/elasticsearch -f
	INSTALL
	  exit(1)
	end

	# Location of the Stack Overflow feed to import.
	URL = 'http://stackoverflow.com/feeds'

	puts "", "Fetching data from #{URL}...", "-"*80

	# Download through URI#open (mixed in by open-uri) rather than Kernel#open,
	# which stopped accepting URLs in Ruby 3.0. The block form closes the
	# downloaded stream as soon as Nokogiri has parsed it.
	#
	# NOTE(review): the feed is parsed with the HTML parser; presumably this keeps
	# the un-prefixed XPath queries below free of namespace handling -- confirm
	# before switching to Nokogiri::XML.
	feed = URI.parse(URL).open { |io| Nokogiri::HTML(io) }

	# Convert every <entry> node into a Hash of the fields to index.
	# :id holds the digits captured from the entry's <id> text (nil when the
	# pattern does not match); :categories is an Array with one String per
	# category/@term attribute.
	documents = feed.search("//entry").map do |entry|
	  result              = {}
	  result[:type]       = 'question'
	  result[:id]         = entry.xpath("id").text[/questions\/(\d+)\//, 1]
	  result[:title]      = entry.xpath("title").text
	  result[:link]       = entry.xpath("link[@rel='alternate']/@href").text
	  result[:categories] = entry.xpath("category/@term").map { |c| c.to_s }
	  result[:author]     = entry.xpath("author/name").text
	  result[:published]  = entry.xpath("published").text
	  result[:summary]    = entry.xpath("summary").text

	  result
	end

	# Report how many documents were parsed and list their titles.
	puts "", "Importing these #{documents.size} documents...", "-"*80

	documents.each { |document| puts "* #{document[:title]}" }

	# Create the 'stackoverflow' index with a mapping for the `question` type
	# (:id, :link, :categories and :author use the 'keyword' analyzer; :title and
	# :summary use 'snowball'), import the parsed documents and refresh the index.
	Tire.index 'stackoverflow' do
	  create :mappings => {
	    :question => {
	      :properties => {
	        :id         => { :type => 'string', :analyzer => 'keyword' },
	        :link       => { :type => 'string', :analyzer => 'keyword' },
	        :categories => { :type => 'string', :analyzer => 'keyword' },
	        :author     => { :type => 'string', :analyzer => 'keyword' },
	        :title      => { :type => 'string', :analyzer => 'snowball' },
	        :summary    => { :type => 'string', :analyzer => 'snowball' }
	      }
	    }
	  }

	  import documents

	  refresh
	end

	# Query the 'stackoverflow' index for the string "ruby" and request a terms
	# facet named 'categories' over the :categories field (:global => true).
	s = Tire.search('stackoverflow') do
	  query { string 'ruby' }
	  facet('categories') { terms :categories, :global => true }
	end

	# Print one line per hit: author, title and the comma-separated categories.
	puts "", "Any questions about ruby?", "-"*80

	s.results.each do |d|
	  puts "#{ d.author } : #{d.title} [#{d.categories.join(', ')}]"
	end

	# Print each term of the 'categories' facet, padded to 15 columns, with its count.
	puts "", "Top 10 categories in database:", "-"*80

	s.results.facets['categories']['terms'].each do |f|
	  puts "#{f['term'].ljust(15)} #{f['count']}"
	end


	# Finally, show the equivalent curl command for the search.
	puts "", "Or, try the search with curl:", "-"*80
	puts s.to_curl