# mgill25/markov.rb

## markov.rb
# Text Analysis for the Markov chain

# class Object
#   def method_missing( name, *args )
#     puts "There is no method '##{name}' defined on #{self.class}, you dummy!"
#   end
# end

# Returns the distinct n-grams of a text as an Array of word Arrays, in order
# of first appearance.
#
# n      - number of words per n-gram.
# corpus - String of whitespace-separated words.
def get_ngrams(n, corpus)
    words = corpus.strip.split(' ')
    # One candidate window per starting word; a window that runs past the end
    # of the text picks up nil entries.
    windows = words.each_index.map do |start|
        n.times.map { |offset| words[start + offset] }
    end
    # Discard the windows that ran off the end, then de-duplicate.
    windows.reject { |window| window.include?(nil) }.uniq
end


# Builds an n-gram transition table from a text corpus and generates random
# text by walking that table (a simple Markov chain).
#
# The corpus is either passed in directly (corpus:) or read from a file of
# "word/tag" tokens under ./brown/ (file_name:).
class TextAnalyser
    attr_reader :transition_map

    # n         - number of words per n-gram (the size of one chain state).
    # corpus    - optional String of whitespace-separated words.
    # file_name - optional name of a tagged corpus file inside ./brown/;
    #             only read when corpus is nil.
    #
    # Raises ArgumentError when neither corpus nor file_name is given.
    def initialize(n:, corpus:nil, file_name:nil)
        # Note how optional named parameters have different syntax,
        # using colons instead of an equal sign!
        if corpus.nil? && file_name.nil?
            raise ArgumentError, 'either corpus: or file_name: must be given'
        end
        @n = n
        @file_name = file_name
        @corpus = corpus || parse_corpus
        @transition_map = generate_transition_map
    end

    # Corpus contains a tag after each word, specifying what kind of word it
    # is. We don't need those: returns the words of one line, without tags,
    # joined by single spaces.
    def remove_tags(line)
        tokens = fix_punctuation(line.strip.split(' '))
        tokens.map { |token| token.split('/').first }.join(' ')
    end

    # Glues punctuation tokens (tag '.', ',' or ':') onto the word before
    # them, e.g. ["said/vbd", ",/,"] becomes ["said,/vbd"].
    # Returns a new Array; line_arr itself is not modified.
    def fix_punctuation(line_arr)
        punctuation_tags = ['.', ',', ':']
        # Build the result in a fresh array: deleting from line_arr while
        # iterating over it would skip the token that follows every merge.
        line_arr.each_with_object([]) do |token, merged|
            mark, tag = token.split('/').values_at(0, -1)
            # A punctuation token that opens the line has no predecessor, so it
            # is kept as-is (index -1 would wrap around to the last token).
            if punctuation_tags.include?(tag) && !merged.empty?
                prev_word, prev_tag = merged.last.split('/').values_at(0, -1)
                merged[-1] = "#{prev_word}#{mark}/#{prev_tag}"
            else
                merged << token
            end
        end
    end

    # Loads ./brown/<file_name>, strips the tags from every line and returns
    # the whole text as one String with single spaces between words.
    def parse_corpus
        puts "parsing corpus for file: #{@file_name}"
        cleaned = File.foreach("./brown/#{@file_name}").map { |line| remove_tags(line) }
        # Blank lines clean up to "" and are dropped. The rest are joined with
        # a space so the last word of one line never fuses with the first word
        # of the next.
        cleaned.reject(&:empty?).join(' ')
    end

    # Returns every run of @n consecutive corpus words, in order and including
    # duplicates, as an Array of word Arrays.
    def ngrams
        @corpus.strip.split(' ').each_cons(@n).to_a
    end

    # Returns a map which correlates an n-gram with all the possible output
    # states (next words) it can have. A word that follows the same n-gram
    # several times is listed several times.
    def generate_transition_map
        # Hash with default value as an Array, to be mutated every time
        # we append something to the Array itself.
        rv = Hash.new { |hsh, key| hsh[key] = [] }

        # ngrams re-splits the corpus on every call, so walk one snapshot
        # pairwise: the successor of an n-gram is the last word of the next one.
        ngrams.each_cons(2) do |current, following|
            rv[current] << following.last
        end
        rv
    end

    # Records val as one more possible successor of ngram.
    # Returns the transition map.
    def update_table(ngram, val)
        # The map's default block creates the successor list on first access.
        @transition_map[ngram] << val
        @transition_map
    end

    # Returns a random successor of state, or nil when state is unknown or has
    # no successors.
    def get_next_state(state)
        # Check key? first: indexing an unknown state would trigger the default
        # block and leave an empty entry behind in the map.
        return nil unless @transition_map.key?(state)

        @transition_map[state].sample
    end

    # Builds the next n-gram: the last (@n - 1) words of arr followed by item.
    def make_ngram(arr, item)
        arr.last(@n - 1) + [item]
    end

    # Generates text by repeatedly picking a random successor of the current
    # n-gram.
    #
    # iterations  - maximum number of words to generate.
    # start_state - n-gram (Array of words) to start from.
    # end_states  - accepted for interface compatibility; currently unused.
    #
    # Prints the start state, and a notice if the chain reaches a state with
    # no successor (generation stops early in that case).
    # Returns the generated words joined by single spaces.
    def generate_text(iterations, start_state: [], end_states:[])
        arr = []
        puts "Starting from: #{start_state}"
        current = start_state
        iterations.times do
            next_state = get_next_state(current)
            if next_state.nil?
                puts "Got nil from #{current}"
                break
            end
            # next_state is the discrete state that goes in the output array
            arr << next_state
            # now we make another n-gram from the output seen so far, and repeat.
            current = make_ngram(current, next_state)
        end
        arr.join(' ')
    end

    # Declare what methods should be private at the end.
    private :remove_tags, :fix_punctuation, :parse_corpus, :generate_transition_map, :make_ngram
end

# Sample in-memory corpus. It is only referenced by the commented-out
# alternative below, so the active code path does not use it.
text = %Q(Far far away, behind the word mountains, far from the countries
Vokalia and Consonantia, there live the blind texts. Separated they live in
Bookmarksgrove right at the coast of the Semantics, a large language ocean. A
small river named Duden flows by their place and supplies it with the
necessary regelialia. It is a paradisematic country, in which roasted parts of
sentences fly into your mouth. Even the all-powerful Pointing has no control
about the blind texts it is an almost unorthographic life One day however a
small line of blind text by the name of Lorem Ipsum decided to leave for the
far World of Grammar. The Big Oxmox advised her not to do so, because there
were thousands of bad Commas, wild Question Marks and devious Semikoli, but
the Little Blind Text didn’t listen. She packed her seven versalia, put her
initial into the belt and made herself on the way. When she reached the first
hills of the Italic Mountains, she had a last view back on the skyline of her
hometown Bookmarksgrove, the headline of Alphabet Village and the subline of
her own road, the Line Lane. Pityful a rethoric question ran over her cheek,
then)

# Alternative setup that builds the analyser from an in-memory string:
# text = "the more we try the more we do"
# ta = TextAnalyser.new(n:2, corpus:text)
# Active setup: bigram analyser built from the tagged corpus file ./brown/cb01.
ta = TextAnalyser.new(n:2, file_name:"cb01")
# Debugging aids:
# puts ta.transition_map
# print ta.ngrams
# Print up to 20 generated words, starting from a randomly chosen bigram.
puts ta.generate_text(20, start_state: ta.ngrams.sample).to_s
# TODO: Can we break up the text line-by-line?
# TODO: Better indication of Start and End tokens?

# Some creative uses of Markov chain text generation:
# - Sonnets
# - Scientific Papers
# - Poems
# - Tweets
# - Blog posts
# - Political Speeches
# - Markov Haiku?
# - And so on...
	# Text Analysis for the Markov chain

	# class Object
	# def method_missing( name, *args )
	# puts "There is no method '##{name}' defined on #{self.class}, you dummy!"
	# end
	# end

	# Returns the distinct n-grams of a text as an Array of word Arrays, in
	# order of first appearance.
	#
	# n      - number of words per n-gram.
	# corpus - String of whitespace-separated words.
	def get_ngrams(n, corpus)
	    words = corpus.strip.split(' ')
	    # One candidate window per starting word; a window that runs past the
	    # end of the text picks up nil entries.
	    windows = words.each_index.map do |start|
	        n.times.map { |offset| words[start + offset] }
	    end
	    # Discard the windows that ran off the end, then de-duplicate.
	    windows.reject { |window| window.include?(nil) }.uniq
	end


	# Builds an n-gram transition table from a text corpus and generates random
	# text by walking that table (a simple Markov chain).
	#
	# The corpus is either passed in directly (corpus:) or read from a file of
	# "word/tag" tokens under ./brown/ (file_name:).
	class TextAnalyser
	    attr_reader :transition_map

	    # n         - number of words per n-gram (the size of one chain state).
	    # corpus    - optional String of whitespace-separated words.
	    # file_name - optional name of a tagged corpus file inside ./brown/;
	    #             only read when corpus is nil.
	    #
	    # Raises ArgumentError when neither corpus nor file_name is given.
	    def initialize(n:, corpus:nil, file_name:nil)
	        # Note how optional named parameters have different syntax,
	        # using colons instead of an equal sign!
	        if corpus.nil? && file_name.nil?
	            raise ArgumentError, 'either corpus: or file_name: must be given'
	        end
	        @n = n
	        @file_name = file_name
	        @corpus = corpus || parse_corpus
	        @transition_map = generate_transition_map
	    end

	    # Corpus contains a tag after each word, specifying what kind of word it
	    # is. We don't need those: returns the words of one line, without tags,
	    # joined by single spaces.
	    def remove_tags(line)
	        tokens = fix_punctuation(line.strip.split(' '))
	        tokens.map { |token| token.split('/').first }.join(' ')
	    end

	    # Glues punctuation tokens (tag '.', ',' or ':') onto the word before
	    # them, e.g. ["said/vbd", ",/,"] becomes ["said,/vbd"].
	    # Returns a new Array; line_arr itself is not modified.
	    def fix_punctuation(line_arr)
	        punctuation_tags = ['.', ',', ':']
	        # Build the result in a fresh array: deleting from line_arr while
	        # iterating over it would skip the token that follows every merge.
	        line_arr.each_with_object([]) do |token, merged|
	            mark, tag = token.split('/').values_at(0, -1)
	            # A punctuation token that opens the line has no predecessor, so
	            # it is kept as-is (index -1 would wrap around to the last token).
	            if punctuation_tags.include?(tag) && !merged.empty?
	                prev_word, prev_tag = merged.last.split('/').values_at(0, -1)
	                merged[-1] = "#{prev_word}#{mark}/#{prev_tag}"
	            else
	                merged << token
	            end
	        end
	    end

	    # Loads ./brown/<file_name>, strips the tags from every line and returns
	    # the whole text as one String with single spaces between words.
	    def parse_corpus
	        puts "parsing corpus for file: #{@file_name}"
	        cleaned = File.foreach("./brown/#{@file_name}").map { |line| remove_tags(line) }
	        # Blank lines clean up to "" and are dropped. The rest are joined with
	        # a space so the last word of one line never fuses with the first word
	        # of the next.
	        cleaned.reject(&:empty?).join(' ')
	    end

	    # Returns every run of @n consecutive corpus words, in order and
	    # including duplicates, as an Array of word Arrays.
	    def ngrams
	        @corpus.strip.split(' ').each_cons(@n).to_a
	    end

	    # Returns a map which correlates an n-gram with all the possible output
	    # states (next words) it can have. A word that follows the same n-gram
	    # several times is listed several times.
	    def generate_transition_map
	        # Hash with default value as an Array, to be mutated every time
	        # we append something to the Array itself.
	        rv = Hash.new { |hsh, key| hsh[key] = [] }

	        # ngrams re-splits the corpus on every call, so walk one snapshot
	        # pairwise: the successor of an n-gram is the last word of the next.
	        ngrams.each_cons(2) do |current, following|
	            rv[current] << following.last
	        end
	        rv
	    end

	    # Records val as one more possible successor of ngram.
	    # Returns the transition map.
	    def update_table(ngram, val)
	        # The map's default block creates the successor list on first access.
	        @transition_map[ngram] << val
	        @transition_map
	    end

	    # Returns a random successor of state, or nil when state is unknown or
	    # has no successors.
	    def get_next_state(state)
	        # Check key? first: indexing an unknown state would trigger the
	        # default block and leave an empty entry behind in the map.
	        return nil unless @transition_map.key?(state)

	        @transition_map[state].sample
	    end

	    # Builds the next n-gram: the last (@n - 1) words of arr followed by item.
	    def make_ngram(arr, item)
	        arr.last(@n - 1) + [item]
	    end

	    # Generates text by repeatedly picking a random successor of the current
	    # n-gram.
	    #
	    # iterations  - maximum number of words to generate.
	    # start_state - n-gram (Array of words) to start from.
	    # end_states  - accepted for interface compatibility; currently unused.
	    #
	    # Prints the start state, and a notice if the chain reaches a state with
	    # no successor (generation stops early in that case).
	    # Returns the generated words joined by single spaces.
	    def generate_text(iterations, start_state: [], end_states:[])
	        arr = []
	        puts "Starting from: #{start_state}"
	        current = start_state
	        iterations.times do
	            next_state = get_next_state(current)
	            if next_state.nil?
	                puts "Got nil from #{current}"
	                break
	            end
	            # next_state is the discrete state that goes in the output array
	            arr << next_state
	            # now we make another n-gram from the output seen so far, and repeat.
	            current = make_ngram(current, next_state)
	        end
	        arr.join(' ')
	    end

	    # Declare what methods should be private at the end.
	    private :remove_tags, :fix_punctuation, :parse_corpus, :generate_transition_map, :make_ngram
	end

	# Sample in-memory corpus. It is only referenced by the commented-out
	# alternative below, so the active code path does not use it.
	text = %Q(Far far away, behind the word mountains, far from the countries
	Vokalia and Consonantia, there live the blind texts. Separated they live in
	Bookmarksgrove right at the coast of the Semantics, a large language ocean. A
	small river named Duden flows by their place and supplies it with the
	necessary regelialia. It is a paradisematic country, in which roasted parts of
	sentences fly into your mouth. Even the all-powerful Pointing has no control
	about the blind texts it is an almost unorthographic life One day however a
	small line of blind text by the name of Lorem Ipsum decided to leave for the
	far World of Grammar. The Big Oxmox advised her not to do so, because there
	were thousands of bad Commas, wild Question Marks and devious Semikoli, but
	the Little Blind Text didn’t listen. She packed her seven versalia, put her
	initial into the belt and made herself on the way. When she reached the first
	hills of the Italic Mountains, she had a last view back on the skyline of her
	hometown Bookmarksgrove, the headline of Alphabet Village and the subline of
	her own road, the Line Lane. Pityful a rethoric question ran over her cheek,
	then)

	# Alternative setup that builds the analyser from an in-memory string:
	# text = "the more we try the more we do"
	# ta = TextAnalyser.new(n:2, corpus:text)
	# Active setup: bigram analyser built from the tagged corpus file ./brown/cb01.
	ta = TextAnalyser.new(n:2, file_name:"cb01")
	# Debugging aids:
	# puts ta.transition_map
	# print ta.ngrams
	# Print up to 20 generated words, starting from a randomly chosen bigram.
	puts ta.generate_text(20, start_state: ta.ngrams.sample).to_s
	# TODO: Can we break up the text line-by-line?
	# TODO: Better indication of Start and End tokens?

	# Some creative uses of Markov chain text generation:
	# - Sonnets
	# - Scientific Papers
	# - Poems
	# - Tweets
	# - Blog posts
	# - Political Speeches
	# - Markov Haiku?
	# - And so on...