Forked from karmi/import-rss-feed-into-elasticsearch.rb
Created
August 9, 2011 10:45
-
-
Save edwardsmit/1133741 to your computer and use it in GitHub Desktop.
Importing and searching RSS with ElasticSearch and Tire
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================

require 'rubygems'
require 'tire'
require 'nokogiri'
require 'open-uri'

# First, let's check for a running ElasticSearch server.
#
# The probe is a plain GET against the local HTTP port. Any StandardError
# raised by the request is treated as "no server reachable"; in that case the
# installation instructions are printed to stdout and the script exits with
# status 1.
#
# NOTE(review): RestClient is not required in this file; presumably it is
# loaded as a dependency of tire -- confirm for the tire version in use.
elasticsearch_running = begin
  RestClient.get('http://localhost:9200')
rescue StandardError
  false
end

unless elasticsearch_running
  puts <<-"INSTALL"
[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:
curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
tar -zxvf elasticsearch-0.16.0.tar.gz
./elasticsearch-0.16.0/bin/elasticsearch -f
  INSTALL
  exit(1)
end
# Address of the feed whose entries are imported.
URL = 'http://stackoverflow.com/feeds'

puts "", "Fetching data from #{URL}...", "-"*80

# Download and parse the feed.
#
# URI#open (provided by open-uri) is called explicitly instead of Kernel#open,
# because Kernel#open no longer opens URLs on Ruby 3.0+. The block form closes
# the downloaded IO as soon as parsing has finished.
#
# NOTE(review): the feed is parsed with the HTML parser, presumably so that the
# un-namespaced XPath expressions below match the feed's elements -- confirm
# before switching to Nokogiri::XML.
feed = URI.parse(URL).open { |io| Nokogiri::HTML(io) }

# Turn every <entry> element into a plain Hash ready for indexing.
#
# :id is the run of digits captured from "questions/<digits>/" in the entry's
# <id> text; it is nil when the text does not contain that pattern.
documents = feed.search("//entry").map do |entry|
  {
    :type       => 'question',
    :id         => entry.xpath("id").text[/questions\/(\d+)\//, 1],
    :title      => entry.xpath("title").text,
    :link       => entry.xpath("link[@rel='alternate']/@href").text,
    :categories => entry.xpath("category/@term").map(&:to_s),
    :author     => entry.xpath("author/name").text,
    :published  => entry.xpath("published").text,
    :summary    => entry.xpath("summary").text
  }
end
# List the titles of the documents that are about to be imported.
puts "", "Importing these #{documents.size} documents...", "-"*80
documents.each { |document| puts "* #{document[:title]}" }

# Mapping for the 'question' type: id, link, categories and author use the
# 'keyword' analyzer, title and summary use the 'snowball' analyzer.
question_mapping = {
  :properties => {
    :id         => { :type => 'string', :analyzer => 'keyword'  },
    :link       => { :type => 'string', :analyzer => 'keyword'  },
    :categories => { :type => 'string', :analyzer => 'keyword'  },
    :author     => { :type => 'string', :analyzer => 'keyword'  },
    :title      => { :type => 'string', :analyzer => 'snowball' },
    :summary    => { :type => 'string', :analyzer => 'snowball' }
  }
}

# Create the 'stackoverflow' index with the mapping above, import the
# documents, then refresh the index before the search that follows.
Tire.index 'stackoverflow' do
  create :mappings => { :question => question_mapping }

  import documents

  refresh
end
# Search the 'stackoverflow' index for "ruby" and request a terms facet over
# the categories field (declared with :global => true).
s = Tire.search('stackoverflow') do
  query { string 'ruby' }

  facet('categories') { terms :categories, :global => true }
end

puts "", "Any questions about ruby?", "-"*80
s.results.each do |d|
  # Array() keeps the listing from failing on a hit without a categories value.
  puts "#{ d.author } : #{d.title} [#{Array(d.categories).join(', ')}]"
end

puts "", "Top 10 categories in database:", "-"*80
s.results.facets['categories']['terms'].each do |f|
  puts "#{f['term'].ljust(15)} #{f['count']}"
end

# Print the equivalent curl command so the query can be replayed by hand.
puts "", "Or, try the search with curl:", "-"*80
puts s.to_curl
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment