Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save dandrews/0fe8ac58f437f898e67e to your computer and use it in GitHub Desktop.
Save dandrews/0fe8ac58f437f898e67e to your computer and use it in GitHub Desktop.
Given an input string and a dictionary of words,
segment the input string into a space-separated
sequence of dictionary words if possible. For
example, if the input string is "applepie" and
dictionary contains a standard set of English words,
then we would return the string "apple pie" as output.
http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/
(ns word-break.core
(:use midje.sweet
[clojure.contrib.core :only [-?> -?>>]]))
(defn word-break [input dict]
(if (contains? dict input)
input
(let [length (count input)
candidates (map #(list (subs input 0 %) %) (range length))
prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
(first (for [[prefix i] prefixes
:let [suffix (-?> (subs input i length) (word-break dict))]
:when suffix]
(str prefix " " suffix))))))
;; inspired by http://stackoverflow.com/questions/3906831/how-do-i-generate-memoized-recursive-functions-in-clojure
(defmacro memoize-fn
"Produces a memoized anonymous function that can recursively call itself."
[fn-name & fn-args]
`(with-local-vars
[~fn-name (memoize
(fn ~@fn-args))]
(.bindRoot ~fn-name @~fn-name)
@~fn-name))
(defn word-break-with-memoization [input dict]
(let [word-break
(memoize-fn word-break [input]
(if (contains? dict input)
input
(let [length (count input)
candidates (map #(list (subs input 0 %) %) (range length))
prefixes (filter (fn [[c _]] (contains? dict c)) candidates)]
(first (for [[prefix i] prefixes
:let [suffix (-?> (subs input i length) word-break)]
:when suffix]
(str prefix " " suffix))))))]
(word-break input)))
(fact
(word-break "applepie" #{"apple" "pie"}) => "apple pie"
(word-break "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
(word-break "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil
(word-break-with-memoization "applepie" #{"apple" "pie"}) => "apple pie"
(word-break-with-memoization "anappleaday" #{"an" "apple" "a" "day"}) => "an apple a day"
(word-break-with-memoization "aaaab" #{"a" "aa" "aaa" "aaaa"}) => nil)
require 'benchmark'
require 'rubygems'
require 'facets/enumerator'
require 'facets/enumerable/defer'
warmup = 2 # The additional warmups are for HotSpot's benefit
(warmup + 1).times do
Benchmark.bmbm do |x|
x.report("non-lazy one op") { (1..100000).map{|i| i + 1} }
x.report("lazy one op") { (1..100000).defer.map{|i| i + 1}.to_a }
x.report("non-lazy two ops") { (1..100000).map{|i| i + 1}.map{|i| i + 1} }
x.report("lazy two ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.to_a }
x.report("non-lazy three ops") { (1..100000).map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1} }
x.report("lazy three ops") { (1..100000).defer.map{|i| i + 1}.map{|i| i + 1}.map{|i| i + 1}.to_a }
end
end
#/usr/bin/env ruby
require 'set'
require 'rubygems'
require 'rspec'
require 'facets/enumerator'
require 'facets/enumerable/defer'
# This monkey patch allows for destructing of the generated/yielded items...
# TODO: open a pull request with Facets with this change...
class Denumerator
def to_ary
self.next.to_ary
end
end
module WordBreak
class << self
def segmentize(string, dictionary)
return string if dictionary.include?(string)
string.size.times do |i|
prefix = string[0..i]
next unless dictionary.include?(prefix)
remaining_segment = segmentize(string[(i+1)..-1], dictionary)
return [prefix, remaining_segment].join(" ") if remaining_segment
end
nil
end
def segmentize_with_memoization(string, dictionary)
segmentize = memoize( -> string {
return string if dictionary.include?(string)
string.size.times do |i|
prefix = string[0..i]
next unless dictionary.include?(prefix)
remaining_segment = segmentize.call(string[(i+1)..-1])
return [prefix, remaining_segment].join(" ") if remaining_segment
end
nil
})
segmentize.call(string)
end
def lazy_segmentize(string, dictionary)
return string if dictionary.include?(string)
(0...(string.size)).defer.
map { |i| [string[0..i], i]}.
select { |(prefix,i)| dictionary.include?(prefix) }.
map{ |(prefix,i)|
remaining_segment = lazy_segmentize(string[(i+1)..-1], dictionary)
[prefix, remaining_segment].join(" ") if remaining_segment }.
reject {|r| r.nil? }.
first
end
def lazy_segmentize_with_memoization(string, dictionary)
segmentize = memoize(
-> string {
return string if dictionary.include?(string)
(0...(string.size)).defer.
map { |i| [string[0..i], i]}.
select { |(prefix,i)| dictionary.include?(prefix) }.
map{ |(prefix,i)|
remaining_segment = segmentize.call(string[(i+1)..-1])
[prefix, remaining_segment].join(" ") if remaining_segment }.
reject {|r| r.nil? }.
first})
segmentize.call(string)
end
private
def memoize(fn)
cache = {}
-> *args { cache.include?(args) ? cache[args] : cache[args] = fn.call(*args) }
end
end
end
describe WordBreak do
describe ".segmentize" do
specify do
WordBreak.segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
WordBreak.segmentize("blah", %w[apple pie].to_set).should be_nil
WordBreak.segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
WordBreak.segmentize("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
end
end
describe ".segmentize_with_memoization" do
it do
#WordBreak.segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
#WordBreak.segmentize_with_memoization("blah", %w[apple pie].to_set).should be_nil
#WordBreak.segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
WordBreak.segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
end
end
describe ".lazy_segmentize" do
specify do
WordBreak.lazy_segmentize("applepie", %w[apple pie].to_set).should == "apple pie"
WordBreak.lazy_segmentize("blah", %w[apple pie].to_set).should == nil
WordBreak.lazy_segmentize("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
end
end
describe ".lazy_segmentize_with_memoization" do
specify do
WordBreak.lazy_segmentize_with_memoization("applepie", %w[apple pie].to_set).should == "apple pie"
WordBreak.lazy_segmentize_with_memoization("blah", %w[apple pie].to_set).should == nil
WordBreak.lazy_segmentize_with_memoization("anappleaday", %w[a an apple day].to_set).should == "an apple a day"
WordBreak.lazy_segmentize_with_memoization("aaaab", %w[a aa aaa aaaa].to_set).should be_nil
end
end
end
class Enumerator
def self.iterate(initial_value, &iterate_fn)
Enumerator.new do |yielder|
yielder.yield initial_value
current_value = initial_value
loop do
current_value = iterate_fn.call(current_value)
yielder.yield current_value
end
end
end
end
describe "Enumerator::iterate" do
it "returns an Enumerator of the block provided over the initial value and each following result" do
numbers = Enumerator.iterate(1) { |v| v + 1}
numbers.take(4).should == [1, 2, 3, 4]
doubles = Enumerator.iterate(1) { |v| v * 2}
doubles.take(4).should == [1, 2, 4, 8]
end
end
require 'benchmark'
require 'facets/random'
def worst_case(n)
["a" * n + "b",
Enumerator.iterate("a"){|i| i + "a"}.take(n-1).to_a] # see what I did here? ;)
end
dictionary = File.open("/usr/share/dict/words") do |f|
f.enum_for(:each_line).defer.map{|l| l.strip}.take(300).to_a # yep, here too...
end
def dictionary.random_words(num_words)
Random.srand(1) # make it deterministic
self.rand_subset(num_words, false).join("")
end
warmup = 2 # The additional warmups are for HotSpot's benefit
(warmup + 1).times do
Benchmark.bmbm do |x|
[10, 50, 100, 200, 300, 400].each do |n|
input, dictionary = worst_case(n)
x.report("non-lazy w/memo; n = #{n}") { WordBreak.segmentize_with_memoization(input, dictionary)}
x.report("lazy w/memo; n = #{n}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
end
#[10, 50, 100, 200].each do |num_words|
# input = dictionary.random_words(num_words)
# x.report("non-lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.segmentize_with_memoization(input, dictionary)}
# x.report("lazy w/memo; num_words = #{num_words}, n = #{input.size}") { WordBreak.lazy_segmentize_with_memoization(input, dictionary)}
#end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment