Importing and searching RSS with ElasticSearch and Tire

elasticoverflow.rb
# =======================================================
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================
#
# This script downloads, parses, and indexes the Stack Overflow feed with ElasticSearch
# via the [Tire](https://github.com/karmi/tire) Rubygem.
#
# Requirements
# ------------
#
# * Sun Java 6 (for ElasticSearch)
# * Ruby >= 1.8.7
# * Rubygems >= 1.5.0
#
# Usage
# -----
#
#     $ ruby elasticoverflow.rb
#
 
require 'rubygems'
require 'open-uri'
require 'benchmark'
 
# Check for required Rubygems, exit otherwise
#
%w[ tire nokogiri ].each do |lib|
  begin
    require lib
  rescue LoadError
    STDERR.puts "[ERROR] Required library '#{lib}' missing.",
                "        Please install it with:",
                "        $ gem install #{lib}", "\n"
    exit(1)
  end
end
 
# Check if ElasticSearch is running on this machine, exit otherwise
#
# (The check uses RestClient, which ships with the rest-client gem,
# a dependency of Tire; we require it explicitly since we call it directly.)
require 'rest_client'

( puts <<-"INSTALL" ; exit(1) ) unless (RestClient.get('http://localhost:9200') rescue false)

[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

  curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
  tar -zxvf elasticsearch-0.16.0.tar.gz
  ./elasticsearch-0.16.0/bin/elasticsearch -f
INSTALL
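# You can also verify the server by hand: ElasticSearch answers on its root
# URL with a JSON status document, something like (version will vary):
#
#     $ curl http://localhost:9200
#     {"ok":true,"status":200, ... ,"version":{"number":"0.16.0", ...}}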
 
URL = 'http://stackoverflow.com/feeds'
 
puts "", "Fetching data from '#{URL}'...", "-"*80
 
# Parse the Stack Overflow Atom feed. Nokogiri's HTML parser is used here,
# since it ignores XML namespaces and keeps the XPath queries below simple.
#
feed = Nokogiri::HTML(open(URL))
 
# Prepare the documents
#
documents = feed.search("//entry").map do |entry|
  result              = {}
  result[:type]       = 'question'
  result[:id]         = entry.xpath("id").text[/questions\/(\d+)\//, 1]
  result[:title]      = entry.xpath("title").text
  result[:link]       = entry.xpath("link[@rel='alternate']/@href").text
  result[:categories] = entry.xpath("category/@term").map { |c| c.to_s }
  result[:author]     = entry.xpath("author/name").text
  result[:published]  = entry.xpath("published").text
  result[:summary]    = entry.xpath("summary").text

  result
end
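# Each prepared document is a plain Ruby Hash. With made-up, illustrative
# values, one looks roughly like this:
#
#     { :type       => 'question',
#       :id         => '1234567',
#       :title      => 'How do I parse an Atom feed in Ruby?',
#       :link       => 'http://stackoverflow.com/questions/1234567/...',
#       :categories => ['ruby', 'atom', 'nokogiri'],
#       :author     => 'Jane Doe',
#       :published  => '2011-05-01T12:00:00Z',
#       :summary    => '<p>...</p>' }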
 
puts "", "Importing these #{documents.size} documents:", "-"*80
 
documents.each { |document| puts "* #{document[:title]}" }
 
elapsed = Benchmark.realtime do

  Tire.index 'stackoverflow' do

    # Create the index with a proper mapping (unless it already exists)
    #
    create :mappings => {
      :question => {
        :properties => {
          :id         => { :type => 'string', :analyzer => 'keyword'  },
          :link       => { :type => 'string', :analyzer => 'keyword'  },
          :categories => { :type => 'string', :analyzer => 'keyword'  },
          :author     => { :type => 'string', :analyzer => 'keyword'  },
          :title      => { :type => 'string', :analyzer => 'snowball' },
          :summary    => { :type => 'string', :analyzer => 'snowball' }
        }
      }
    }
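    # ('keyword' indexes each value as one unanalyzed token, suitable for
    # exact matches and facet counts; 'snowball' stems English words for
    # full-text search.)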
 
    # Import documents
    import documents

    # Refresh the index for immediate searching
    #
    refresh
  end
 
end
 
puts "-"*80, "Importing took #{(elapsed*1000).to_i} milliseconds"
 
puts "", "Searching...", "-"*80
 
s = Tire.search('stackoverflow') do

  # Search for questions containing 'ruby'
  #
  query { string 'ruby' }

  # Retrieve aggregated counts for the top ten categories, computed globally
  # (over the whole index, not just the documents matching the query)
  #
  facet('categories') { terms :categories, :global => true }
end
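# Tire translates the block above into the ElasticSearch Query DSL; the
# request body it sends should look roughly like this (a sketch, not the
# exact serialization -- see `s.to_curl` below for the real thing):
#
#     { "query"  : { "query_string" : { "query" : "ruby" } },
#       "facets" : { "categories"   : { "terms"  : { "field" : "categories" },
#                                       "global" : true } } }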
 
puts "Search took #{s.results.time} milliseconds."
 
puts "", "Any questions about ruby?", "-"*80
 
s.results.each do |d|
  puts "#{d.author} : #{d.title} [#{d.categories.join(', ')}]"
end
 
puts "", "Top 10 categories in database:", "-"*80
 
s.results.facets['categories']['terms'].each do |f|
  puts "#{f['term'].ljust(15)} #{f['count']}"
end
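# Each facet entry is a Hash with 'term' and 'count' keys, so the listing
# comes out as two aligned columns, e.g. (counts are illustrative):
#
#     ruby            12
#     rails            7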
 
puts "", "Or, try the search with curl:", "-"*80
puts s.to_curl
