karmi/percolated-twitter.rb

## percolated-twitter.rb
# Reversed or “Real Time” Search in ElasticSearch
# ====================================================================================

# You may have come across the term “realtime search” lately
# (eg. [here](http://engineering.socialcast.com/2011/05/realtime-search-solr-vs-elasticsearch/))
# and wondered what all the fuss is about.
#
# Well, the usual workflow with search engines goes like this:
#
# 1. You index some documents.
# 2. You perform a search query.
#
# In [_ElasticSearch_](http://www.elasticsearch.org/), there's a default delay
# of 1 second between indexing a document and being able to search it.
# You can, of course, force refresh the index. That's enough “realtime”
# in my book, if you ask me.
#
# However, _ElasticSearch_ comes with a much more powerful idea for “realtime search”,
# called [_percolation_](http://www.elasticsearch.org/guide/reference/api/percolate.html).
#
# It _reverses_ the usual workflow like this:
#
# 1. You register some queries you'd like to perform in real time, on demand,
# or _while the documents are being indexed_.
# 2. You index some documents.
#
# The search engine will match your documents against your queries, and return names of matching queries
# in the response. Yes, _names_.
#
# This allows you to do crazy stuff immediately, _while your documents are being indexed_.
# Think [Google Alerts](http://www.google.com/alerts) for _your_ data.
#
# The _Tire_ gem for _ElasticSearch_ recently got
# [percolation support](http://karmi.github.com/tire/#section-Percolation).
#
# In this script, we'll register a couple of queries, and then fetch data from _Twitter_,
# receiving notifications when any status message matches one of our queries.
#
# You can download this script at <https://gist.github.com/1025498>.
#

# First of all, let's install some gems:
#
#     gem install tire ansi yajl-ruby
#
# Note that you need the 0.1.11 version of the `tire` gem.
#
require 'rubygems'
require 'time'
require 'uri'
require 'tire'
require 'ansi/code'
require 'yajl/http_stream'

include ANSI::Code

# Let's define a class to hold our data in _ElasticSearch_.
#
# We're using the _persistence_ mode of _Tire_, so we don't need a database, because
# [_the index is our database_](http://www.slideshare.net/karmi/your-data-your-search-elasticsearch-euruko-2011/49).
#
class Status
  include Tire::Model::Persistence

  # The attributes stored with every status document.
  [:user, :text, :created_at].each { |name| property name }

  # Let's define a callback for percolation.
  # Whenever a new document is saved in the index, this block will be executed,
  # and we will have access to matching queries in the `Status#matches` property.
  #
  # In our case, we just report the matching queries on the console,
  # and stay quiet when nothing matched.
  #
  on_percolate do
    unless matches.empty?
      report = "'#{text}' from @#{user} matches queries: #{matches.inspect}"
      puts green { report }
    end
  end
end

# Let's register the queries for percolation now.
#
# First, let's define the query_string queries, keyed by the name
# under which each of them will be registered.
#
queries = {
  'newspeak' => 'wow omg lol wtf fuu*',
  'fails'    => 'fail',
  'memes'    => '"why u no" "all your base" "i can has"'
}
#
# Second, let's save those queries in _ElasticSearch_.
#
queries.each do |name, query_string|
  Status.index.register_percolator_query(name) { |query| query.string query_string }
end

puts "", bold { "Testing percolation" }, '-'*80

# Let's check out the percolation on some “example data”.
#
status = Status.new :text => 'OMG i can has #fail'
puts "'#{status.text}' matches queries #{status.percolate.inspect}"

# Now, let's fetch some real data from the collective consciousness.
#
# NOTE(review): this is Twitter's unauthenticated v1 REST endpoint, which may
# no longer be available -- confirm before relying on it.
#
url = URI.parse("http://api.twitter.com/1/statuses/public_timeline.json")
puts "", bold { "Fetching '#{url}'" }, '-'*80

# We poll the timeline a fixed number of times, pausing between requests
# (but not after the last one).
#
rounds = 5
pause  = 10

rounds.times do |round|

  # Get JSON data from _Twitter_.
  Yajl::HttpStream.get(url, :symbolize_keys => true) do |timeline|

    # The block parameter is called `tweet` so that it does not shadow
    # the `status` local variable defined above.
    timeline.each do |tweet|

      # Create new document from each status message.
      #
      # Watch the output from the `on_percolate` callback in your console.
      # You may have to run this file repeatedly, in case _Twitter_ gets quiet.
      #
      Status.create :id         => tweet[:id],
                    :user       => tweet[:user][:screen_name],
                    :text       => tweet[:text],
                    :created_at => Time.parse(tweet[:created_at])
    end

    puts "Indexed #{timeline.size} tweets"
  end
  sleep pause if round < rounds - 1
end

puts "", bold { "Check out your index" }, '-'*80

# You can check out the documents in your index with `curl` or your browser.
#
puts "curl 'http://localhost:9200/statuses/_search?q=*&sort=created_at:desc&size=5&pretty=true'", ""
	# Reversed or “Real Time” Search in ElasticSearch
	# ====================================================================================

	# You may have come across the term “realtime search” lately
	# (eg. [here](http://engineering.socialcast.com/2011/05/realtime-search-solr-vs-elasticsearch/))
	# and wondered what all the fuss is about.
	#
	# Well, the usual workflow with search engines goes like this:
	#
	# 1. You index some documents.
	# 2. You perform a search query.
	#
	# In [_ElasticSearch_](http://www.elasticsearch.org/), there's a default delay
	# of 1 second between indexing a document and being able to search it.
	# You can, of course, force refresh the index. That's enough “realtime”
	# in my book, if you ask me.
	#
	# However, _ElasticSearch_ comes with a much more powerful idea for “realtime search”,
	# called [_percolation_](http://www.elasticsearch.org/guide/reference/api/percolate.html).
	#
	# It _reverses_ the usual workflow like this:
	#
	# 1. You register some queries you'd like to perform in real time, on demand,
	# or _while the documents are being indexed_.
	# 2. You index some documents.
	#
	# The search engine will match your documents against your queries, and return names of matching queries
	# in the response. Yes, _names_.
	#
	# This allows you to do crazy stuff immediately, _while your documents are being indexed_.
	# Think [Google Alerts](http://www.google.com/alerts) for _your_ data.
	#
	# The _Tire_ gem for _ElasticSearch_ recently got
	# [percolation support](http://karmi.github.com/tire/#section-Percolation).
	#
	# In this script, we'll register a couple of queries, and then fetch data from _Twitter_,
	# receiving notifications when any status message matches one of our queries.
	#
	# You can download this script at <https://gist.github.com/1025498>.
	#

	# First of all, let's install some gems:
	#
	#     gem install tire ansi yajl-ruby
	#
	# Note that you need the 0.1.11 version of the `tire` gem.
	#
	require 'rubygems'
	require 'time'
	require 'uri'
	require 'tire'
	require 'ansi/code'
	require 'yajl/http_stream'

	include ANSI::Code

	# Let's define a class to hold our data in _ElasticSearch_.
	#
	# We're using the _persistence_ mode of _Tire_, so we don't need a database, because
	# [_the index is our database_](http://www.slideshare.net/karmi/your-data-your-search-elasticsearch-euruko-2011/49).
	#
	class Status
	  include Tire::Model::Persistence

	  # The attributes stored with every status document.
	  [:user, :text, :created_at].each { |name| property name }

	  # Let's define a callback for percolation.
	  # Whenever a new document is saved in the index, this block will be executed,
	  # and we will have access to matching queries in the `Status#matches` property.
	  #
	  # In our case, we just report the matching queries on the console,
	  # and stay quiet when nothing matched.
	  #
	  on_percolate do
	    unless matches.empty?
	      report = "'#{text}' from @#{user} matches queries: #{matches.inspect}"
	      puts green { report }
	    end
	  end
	end

	# Let's register the queries for percolation now.
	#
	# First, let's define the query_string queries, keyed by the name
	# under which each of them will be registered.
	#
	queries = {
	  'newspeak' => 'wow omg lol wtf fuu*',
	  'fails'    => 'fail',
	  'memes'    => '"why u no" "all your base" "i can has"'
	}
	#
	# Second, let's save those queries in _ElasticSearch_.
	#
	queries.each do |name, query_string|
	  Status.index.register_percolator_query(name) { |query| query.string query_string }
	end

	puts "", bold { "Testing percolation" }, '-'*80

	# Let's check out the percolation on some “example data”.
	#
	status = Status.new :text => 'OMG i can has #fail'
	puts "'#{status.text}' matches queries #{status.percolate.inspect}"

	# Now, let's fetch some real data from the collective consciousness.
	#
	# NOTE(review): this is Twitter's unauthenticated v1 REST endpoint, which may
	# no longer be available -- confirm before relying on it.
	#
	url = URI.parse("http://api.twitter.com/1/statuses/public_timeline.json")
	puts "", bold { "Fetching '#{url}'" }, '-'*80

	# We poll the timeline a fixed number of times, pausing between requests
	# (but not after the last one).
	#
	rounds = 5
	pause  = 10

	rounds.times do |round|

	  # Get JSON data from _Twitter_.
	  Yajl::HttpStream.get(url, :symbolize_keys => true) do |timeline|

	    # The block parameter is called `tweet` so that it does not shadow
	    # the `status` local variable defined above.
	    timeline.each do |tweet|

	      # Create new document from each status message.
	      #
	      # Watch the output from the `on_percolate` callback in your console.
	      # You may have to run this file repeatedly, in case _Twitter_ gets quiet.
	      #
	      Status.create :id         => tweet[:id],
	                    :user       => tweet[:user][:screen_name],
	                    :text       => tweet[:text],
	                    :created_at => Time.parse(tweet[:created_at])
	    end

	    puts "Indexed #{timeline.size} tweets"
	  end
	  sleep pause if round < rounds - 1
	end

	puts "", bold { "Check out your index" }, '-'*80

	# You can check out the documents in your index with `curl` or your browser.
	#
	puts "curl 'http://localhost:9200/statuses/_search?q=*&sort=created_at:desc&size=5&pretty=true'", ""