# plexus/partitioner.rb

## partitioner.rb
# Very (very) naive word segmentation algorithm for Chinese
# (or any language with similar characteristics); works at the
# character level.
class Partitioner
  # Hash whose keys are the known n-grams; every value is +true+.
  attr_reader :ngrams

  # +ngrams+    Enumerable list of known n-grams (strings).
  # +lookahead+ Maximum number of characters tried for a single segment.
  def initialize(ngrams, lookahead = 6)
    @lookahead = lookahead
    @ngrams = {}
    ngrams.each { |ng| @ngrams[ng] = true }
  end

  # Greedy longest-match segmentation: walks +text+ from beginning to end,
  # each time taking the longest run of leading characters (at most
  # +lookahead+ of them) that is a known n-gram. A character that starts no
  # known n-gram is emitted as a segment of its own.
  #
  # +text+ String to segment.
  #
  # Returns an Array of Strings which, joined together, give +text+ back.
  def partition(text)
    chars = text.chars
    # A window smaller than 1 could never consume a character, which would
    # make the scan loop forever; fall back to single-character segments.
    longest = [@lookahead, 1].max
    result = []
    pos = 0
    while pos < chars.length
      # Never look past the end of the text: longer windows would only
      # repeat the lookup of the same trailing string.
      window = [longest, chars.length - pos].min
      # Length 1 always matches, so +find+ cannot come back empty.
      len = window.downto(1).find { |n| n == 1 || ngrams[chars[pos, n].join] }
      result << chars[pos, len].join
      pos += len
    end
    result
  end
end
	# Very (very) naive word segmentation algorithm for Chinese
	# (or any language with similar characteristics); works at the
	# character level.
	class Partitioner
	  # Hash whose keys are the known n-grams; every value is +true+.
	  attr_reader :ngrams

	  # +ngrams+    Enumerable list of known n-grams (strings).
	  # +lookahead+ Maximum number of characters tried for a single segment.
	  def initialize(ngrams, lookahead = 6)
	    @lookahead = lookahead
	    @ngrams = {}
	    ngrams.each { |ng| @ngrams[ng] = true }
	  end

	  # Greedy longest-match segmentation: walks +text+ from beginning to end,
	  # each time taking the longest run of leading characters (at most
	  # +lookahead+ of them) that is a known n-gram. A character that starts no
	  # known n-gram is emitted as a segment of its own.
	  #
	  # +text+ String to segment.
	  #
	  # Returns an Array of Strings which, joined together, give +text+ back.
	  def partition(text)
	    chars = text.chars
	    # A window smaller than 1 could never consume a character, which would
	    # make the scan loop forever; fall back to single-character segments.
	    longest = [@lookahead, 1].max
	    result = []
	    pos = 0
	    while pos < chars.length
	      # Never look past the end of the text: longer windows would only
	      # repeat the lookup of the same trailing string.
	      window = [longest, chars.length - pos].min
	      # Length 1 always matches, so +find+ cannot come back empty.
	      len = window.downto(1).find { |n| n == 1 || ngrams[chars[pos, n].join] }
	      result << chars[pos, len].join
	      pos += len
	    end
	    result
	  end
	end