Last active
January 28, 2016 16:08
-
-
Save mhink/72a1cf590ce22d08ad9a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A heavily annotated version of this script follows below. | |
module EachWindow | |
def each_window(n, &block) | |
if self.length < n | |
raise "Length must be greater than window size" | |
end | |
if block_given? | |
(0..self.size-n).each do |ix| | |
yield self.slice(ix, n) | |
end | |
else | |
to_enum(__method__, n) do | |
(self.size - n + 1) | |
end | |
end | |
end | |
end | |
Array.include(EachWindow) | |
def sliceify(words, n) | |
words. | |
reject { |word| word.length < n }. | |
map { |word| word. | |
chars. | |
each_window(n). | |
each_with_object(word). | |
map { |slice, word| | |
[slice.join(""), word] | |
}}. | |
flatten(1). | |
sort_by { |(slice, word)| slice.downcase } | |
end | |
in_filename = ARGV[1] || "hello_world.in" | |
out_filename = ARGV[2] || "hello_world.out" | |
begin | |
infile = File.open(in_filename, "r") | |
outfile = File.open(out_filename, "w+") | |
words = infile.each_line.map(&:chomp) | |
sliceify(words, 4).each do |(slice, word)| | |
outfile.puts "#{slice} #{word}" | |
end | |
ensure | |
infile.close | |
outfile.close | |
end | |
# ------------------------------------------------------------------------- | |
# First, let's implement the idea of 'sliding' a window across a sequence | |
# of elements. The Ruby way is to ask 'what methods does this object respond | |
# to' rather than 'what is the Class of this object'- therefore, we implement | |
# this as a Module which can be included into any Object which responds to | |
# the #each, #size, and #slice method calls. | |
# | |
# You have to trust that this isn't too much like the relevant XKCD (974)- it's | |
# more robust to implement this as a standalone idea. The reason? Because | |
# we're not relying on assumptions about *what the object is*- we're relying on | |
# assumptions about *how the object behaves*. | |
module EachWindow | |
# This method is an 'iterator-style' method, which can be chained | |
# with other iterators like #each, #inject, and #map. | |
# | |
# Its first argument is the size of the window we would like to | |
# iterate over. It yields each 'window' to the block it's given. | |
def each_window(n, &block) | |
# If we tried to chop a string of length 4 out of, say, "Foo", | |
# we would get non-sensical results. So we implement a guard | |
# to prevent Weird Shit like that from happening. | |
if self.length < n | |
raise "Length must be greater than window size" | |
end | |
if block_given? | |
# Here's the meat of the method. Suppose we're sliding a window | |
# of length 2 over this Array: | |
# | |
# index: 0 1 2 | |
# array: ['a', 'b', 'c'] (size is 3) | |
# slice 1: ['a', 'b'] | |
# slice 2: [ 'b', 'c'] | |
# | |
# The size of the array itself is 3, and the size of our window | |
# is 2. Simple reasoning shows that we can only *start* our slices | |
# at the indices 0 and 1. | |
(0..self.size-n).each do |ix| | |
# At each point, we extract the "slice" of the Array and yield it | |
# to the block provided by the calling code. | |
yield self.slice(ix..ix + n) | |
end | |
else | |
# On the other hand, if we weren't given a block to yield our slices | |
# to, we use the Kernel#to_enum method to create an Enumerator object | |
# which we can chain other enumerator-style methods onto. We'll see | |
# this in action later. | |
to_enum(__method__, n) do | |
(self.size - n) | |
end | |
end | |
end | |
end | |
# Because Array includes #each, #size, and #slice, we can "mix in" the module | |
# defined above, which means that any instance of Array will get a copy of the | |
# #each_window method. Score! | |
Array.include(EachWindow) | |
# Now, we implement the actual algorithm described in the problem. It takes | |
# an array and a window size. Again, we don't want to abstract *too* much, but | |
# we have an invariant that we want to remove words which are shorter than | |
# our window size. | |
def sliceify(words, n) | |
# First, we take the words we were given. For the sake of example, let's | |
# use the array ['pooba', 'baz', 'quux'], with a window size of 4. | |
words. | |
# Now, we filter out words less than 4 characters long. The return | |
# value of #reject is ['pooba', 'quux']. | |
reject { |word| word.length < n }. | |
# Now, we are building a new array, whose elements represent each | |
# element of ['pooba', 'quux'] with a transformation applied. | |
map { |word| word. | |
# The #chars method turns a String like | |
# "pooba" into an Array like ['p', 'o', 'o', 'b', 'a']. | |
chars. | |
# Our #each_window method has its chance to shine! It will yield | |
# the Arrays ['p', 'o', 'o', 'b'] and ['o', 'o', 'b', 'a']. But | |
# we still need to get those into a dictionary-like format. | |
each_window(n). | |
# Next, we *compose* our Enumerator with the #each_with_object | |
# method. Basically we want to iterate over each of the Arrays | |
# above and do something with the word corresponding to it. | |
each_with_object(word). | |
# Finally, we want to build an Array out of the result of each | |
# execution of this block. Within this block, the 'slice' parameter | |
# is being provided by #each_window, and the 'word' parameter is | |
# being provided by #each_with_object. | |
map { |slice, word| | |
# And now we wave the black-magic wand. In the first execution | |
# of this block, 'slice' is the Array ['p', 'o', 'o', 'b'], and | |
# 'word' is the String "pooba". We #join the former back into a | |
# String, and stuff it into its own little pair, along with the | |
# word it was sliced out of. | |
[slice.join(""), word] | |
}}. | |
# Whew! At this point, we have an Array of Arrays which looks like this: | |
# [ | |
# [ | |
# ['poob', 'pooba'], | |
# ['ooba', 'pooba'] | |
# ], | |
# [ | |
# ['quux', 'quux'] | |
# ] | |
# ] | |
# | |
# We need to get all those two-element arrays of String back into one | |
# 'unified' Array. Because this structure is self-similar in a way, we | |
# can #flatten one layer of it and get what we want: | |
# [ | |
# ['poob', 'pooba'], | |
# ['ooba', 'pooba'] | |
# ['quux', 'quux'] | |
# ] | |
flatten(1). | |
# Finally, we take each of these Arrays and sort them, using their | |
# first element as the key. | |
sort_by do |(slice, word)| | |
# One final bugfix: apparently, upper-case characters are | |
# sorted before lower-case characters: "Zot" will be sorted | |
# before "alpha". So we make sure to sort these strings based | |
# on their lower-case versions. | |
# | |
# One learns something new every day. | |
slice.downcase | |
end | |
# So now, we have the following array: | |
# [ | |
# ['ooba', 'pooba'] | |
# ['poob', 'pooba'], | |
# ['quux', 'quux'] | |
# ] | |
# | |
# Which is what we wanted all along. | |
end | |
# So now, on to the actual usage of all this! Let's pull the filename | |
# out of our script's arguments, with a fallback. | |
in_filename = ARGV[1] || "hello_world.in" | |
out_filename = ARGV[2] || "hello_world.out" | |
# We use the begin..ensure block to make sure we're being good stewards | |
# and closing our files, even if an Exception gets thrown somewhere. | |
begin | |
# Good practice: only open your files with the flags you actually need. | |
# We open our input file with a "read-only" flag and our output file with | |
# the "write-only" flag. | |
infile = File.open(in_filename, "r") | |
outfile = File.open(out_filename, "w+") | |
# We get each line of the input file, and map the #chomp method over it | |
# to remove newline characters. | |
words = infile.each_line.map(&:chomp) | |
# And finally, we simply use our implementation of the algorithm above... | |
slices = sliceify(words, 4) | |
# ...and dump each line into our output file. (We could have been really | |
# clever and made #sliceify into an enumerator-style ethod like #each_window, | |
# but I figure that's pushing things a little bit.) | |
slices.each do |(slice, word)| | |
outfile.puts "#{slice} #{word}" | |
end | |
ensure | |
# And close our files. | |
infile.close | |
outfile.close | |
end | |
# fin. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment