-
-
Save karmi/947437 to your computer and use it in GitHub Desktop.
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================

require 'rubygems'
require 'tire'
require 'nokogiri'
require 'open-uri'

# First, let's check for a running ElasticSearch server.
#
# Any StandardError raised by the request (the same class of errors the
# original inline `rescue` modifier caught) is treated as "server not
# reachable".
# NOTE(review): RestClient is not required explicitly in this file;
# presumably it is loaded by Tire -- confirm.
server_running = begin
  RestClient.get('http://localhost:9200')
rescue StandardError
  false
end

unless server_running
  # Print installation hints and stop with a non-zero exit status.
  puts <<-"INSTALL"
[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:
curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
tar -zxvf elasticsearch-0.16.0.tar.gz
./elasticsearch-0.16.0/bin/elasticsearch -f
  INSTALL
  exit(1)
end
# Feed to import.
URL = 'http://stackoverflow.com/feeds'

puts "", "Fetching data from #{URL}...", "-"*80

# Download the feed. open-uri's patch of Kernel#open no longer handles URLs
# on Ruby 3.0+, so prefer the public URI.open where it exists and fall back
# to Kernel#open on older Rubies where URI.open is not publicly callable.
feed_io = URI.respond_to?(:open) ? URI.open(URL) : open(URL)

# NOTE(review): the feed appears to be Atom XML but is parsed with the HTML
# parser, presumably so the un-namespaced XPath expressions below match --
# confirm before switching to Nokogiri::XML.
feed = Nokogiri::HTML(feed_io)

# Turn every <entry> element into a plain Hash ready for indexing.
documents = feed.search("//entry").map do |entry|
  {
    :type       => 'question',
    # Numeric part captured from the ".../questions/<digits>/..." text of <id>.
    :id         => entry.xpath("id").text[/questions\/(\d+)\//, 1],
    :title      => entry.xpath("title").text,
    :link       => entry.xpath("link[@rel='alternate']/@href").text,
    # One string per <category term="..."> attribute.
    :categories => entry.xpath("category/@term").map { |c| c.to_s },
    :author     => entry.xpath("author/name").text,
    :published  => entry.xpath("published").text,
    :summary    => entry.xpath("summary").text
  }
end
puts "", "Importing these #{documents.size} documents...", "-"*80

# List the title of every document about to be imported.
documents.each { |document| puts "* #{document[:title]}" }

# Create the 'stackoverflow' index with an explicit mapping for the
# 'question' type, import the documents built above, then refresh the index.
Tire.index 'stackoverflow' do
  create :mappings => {
    :question => {
      :properties => {
        # These fields are mapped with the 'keyword' analyzer.
        :id         => { :type => 'string', :analyzer => 'keyword'  },
        :link       => { :type => 'string', :analyzer => 'keyword'  },
        :categories => { :type => 'string', :analyzer => 'keyword'  },
        :author     => { :type => 'string', :analyzer => 'keyword'  },
        # Free-text fields are mapped with the 'snowball' analyzer.
        :title      => { :type => 'string', :analyzer => 'snowball' },
        :summary    => { :type => 'string', :analyzer => 'snowball' }
      }
    }
  }

  import documents

  refresh
end
# Search the 'stackoverflow' index for "ruby" and request a terms facet on
# the :categories field (declared with :global => true).
s = Tire.search('stackoverflow') do
  query { string 'ruby' }

  facet('categories') { terms :categories, :global => true }
end

puts "", "Any questions about ruby?", "-"*80

# One line per hit: author, title and the hit's categories.
s.results.each do |d|
  puts "#{ d.author } : #{d.title} [#{d.categories.join(', ')}]"
end

puts "", "Top 10 categories in database:", "-"*80

# One line per facet entry: the term padded to 15 columns, then its count.
s.results.facets['categories']['terms'].each do |f|
  puts "#{f['term'].ljust(15)} #{f['count']}"
end

puts "", "Or, try the search with curl:", "-"*80

# Print the search as a curl command line.
puts s.to_curl
I'm also thinking about line 71:
Using a string as index identifier ('stackoverflow') carries a bit of power with it, flexibility of putting it together dynamically etc. but I wonder if it wouldn't be more prudent to use an actual object here:
stackoverflow = Slingshot.index 'stackoverflow' do
create :mappings => {
# ...
}
end
stackoverflow.import documents
stackoverflow.refresh
s = Slingshot.search(stackoverflow) do # ...
(Yes, I know: Picky works like that now, and it once worked like your example. It changed for a reason: to have stronger "bindings" than just loose strings. If I change a string, it will only crash much later in the backend, whereas with variables it will crash earlier, and so on.)
Perhaps have it so I can use both a String and/or an Index?
In the example, I extracted #import and #refresh out of the block so as to give the block the single meaning and function of "here I am configuring"; afterwards, the instance is where "here I perform actions like import and refresh" on the index.
A quick feedback about the API. Imho, on line 60 I think I'd either return the results exclusively, or return an un-#perform-ed search such that I can call facet etc. on it and then call #perform myself.
I like the one-ruby-file approach, btw.