# --------------------------------------------------------------
# Simplified model of Facebook's Message Inbox Search with HBase
# --------------------------------------------------------------
#
# Facebook exploits versioning support in HBase with a very interesting twist:
# it stores message IDs for a given token as “custom timestamps” in the database.
#
# The [HBase: The Definitive Guide](http://ofps.oreilly.com/titles/9781449396107/advanced.html#advsearch) book says (p. 385):
#
# > A prominent implementation of a client managed solution is the Facebook inbox search. The schema is built roughly like this:
# >
# > * Every row is a single inbox, i.e., every user has a single row in the search table,
# >
# > * the columns are the terms indexed from the messages,
# >
# > * the versions are the message IDs,
# >
# > * the values contain additional information, such as the position of the term in the document.
#
# See also the [Facebook Messages & HBase](http://www.slideshare.net/brizzzdotcom/facebook-messages-hbase/14) presentation.
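#
# The four bullets above can be pictured as a nested map. Here is a
# hypothetical sketch of Mary's row, with terms and message IDs invented
# purely for illustration (plain Ruby, not the HBase API):
#

```ruby
# Row key = user, column qualifier = term, cell version = message ID,
# cell value = extra info such as term positions (left empty here).
mary_row = {
  "index:dinner" => { 2 => "", 1 => "" },  # 'dinner' occurs in messages 2 and 1
  "index:coffee" => { 2 => "" }            # 'coffee' occurs in message 2 only
}

# "Which messages mention 'dinner'?" is just reading the versions, newest first:
dinner_ids = mary_row["index:dinner"].keys.sort.reverse
p dinner_ids
# => [2, 1]
```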
#
# Run the example with:
#
# $ hbase shell facebook-messages-search.rb
#
# --------------------------------------------------------------
# First, some auxiliary infrastructure:
# 1) Let's define some stopwords for the tokenization process.
#
STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|
# 2) Let's define a method to create tokens from the text stream.
#
def tokenize(content)
  content.split(/\W/).
    map    { |word| word.downcase }.
    reject { |word| STOPWORDS.include?(word) || word == '' }
end
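#
# A quick sanity check of the tokenizer. The snippet below repeats the
# STOPWORDS list and the method verbatim so it runs on its own, outside
# the HBase shell:
#

```ruby
STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|

def tokenize(content)
  content.split(/\W/).
    map    { |word| word.downcase }.
    reject { |word| STOPWORDS.include?(word) || word == '' }
end

# Note: the apostrophe in "Let's" is a non-word character, so it yields
# the noisy token 's' -- a real analyzer would strip or join it.
p tokenize("Let's have a dinner!")
# => ["let", "s", "have", "dinner"]
```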
# 3) Let's define a method to search a user's messages for the given words.
#
def search(words)
  tokens  = tokenize(words)
  columns = tokens.map { |t| "index:#{t}" }
  puts "Let's search for words #{tokens.map { |t| "'#{t}'" }.join(', ')}:"
  puts "> get 'messages', 'mary', { COLUMNS => #{columns.inspect}, VERSIONS => 10 }", ""
  get 'messages', 'mary', { COLUMNS => columns, VERSIONS => 10 }
end
# Now, let's add some data.
# Create the table to hold the index for messages. Every user has one row in the table.
#
begin
  disable 'messages'
  drop    'messages'
rescue
  # Ignore the error when the table does not exist yet (first run)
end
create 'messages', {NAME => 'index', VERSIONS => 1000}
# Mary receives a message...
#
message = {:id => 1, :content => "Let's have a dinner!"}
# Let's index message 1:
#
tokens = tokenize(message[:content])
puts "Analyzed content '#{message[:content]}' as: #{tokens.join(', ')}"
tokens.each do |token|
  put 'messages', 'mary', "index:#{token}", '', message[:id]
end
# Mary receives another message...
#
message = {:id => 2, :content => "Hmm, dinner? What about just a coffee?"}
# Let's index message 2:
#
tokens = tokenize(message[:content])
puts "Analyzed content '#{message[:content]}' as: #{tokens.join(', ')}"
tokens.each do |token|
  put 'messages', 'mary', "index:#{token}", '', message[:id]
end
# OK, what does the index look like for Mary's messages now?
puts "Index for Mary's messages contains these tokens (columns):"
puts "> get 'messages', 'mary', 'index'", ""
get 'messages', 'mary', 'index'
# Let's search Mary's last 10 messages for some terms, such as 'dinner' or 'coffee'
#
query = 'dinner coffee'
search(query)
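#
# The whole flow above can also be simulated outside the HBase shell with
# a plain Ruby Hash standing in for the versioned table. This is a sketch
# of the idea, not the HBase API; the names `new_table`, `index_message`
# and `search_messages` are invented for illustration:
#

```ruby
STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|

def tokenize(content)
  content.split(/\W/).
    map    { |word| word.downcase }.
    reject { |word| STOPWORDS.include?(word) || word == '' }
end

# One row per user; each column holds { message_id => value }, mimicking
# HBase cell versions keyed by the "custom timestamp" (the message ID).
def new_table
  Hash.new { |rows, user| rows[user] = Hash.new { |row, col| row[col] = {} } }
end

def index_message(table, user, message)
  tokenize(message[:content]).each do |token|
    table[user]["index:#{token}"][message[:id]] = ''
  end
end

# Return up to `limit` most recent message IDs per term, like VERSIONS => 10.
def search_messages(table, user, words, limit = 10)
  tokenize(words).each_with_object({}) do |token, hits|
    versions = table[user]["index:#{token}"].keys.sort.reverse.first(limit)
    hits[token] = versions unless versions.empty?
  end
end

table = new_table
index_message(table, 'mary', :id => 1, :content => "Let's have a dinner!")
index_message(table, 'mary', :id => 2, :content => "Hmm, dinner? What about just a coffee?")

results = search_messages(table, 'mary', 'dinner coffee')
p results["dinner"]  # => [2, 1]
p results["coffee"]  # => [2]
```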