Skip to content

Instantly share code, notes, and snippets.

@mgill25
Created November 19, 2014 12:34
Show Gist options
  • Save mgill25/effc45adeb392ffe74cd to your computer and use it in GitHub Desktop.
Save mgill25/effc45adeb392ffe74cd to your computer and use it in GitHub Desktop.
Markov generator finished
# Text Analysis for the Markov chain
# class Object
# def method_missing( name, *args )
# puts "There is no method '##{name}' defined on #{self.class}, you dummy!"
# end
# end
# Returns the distinct n-grams of a whitespace-separated corpus, in order of
# first appearance.
#
# @param n [Integer] number of consecutive words per n-gram
# @param corpus [String] text to tokenize on whitespace
# @return [Array<Array<String>>] unique word windows of length n
def get_ngrams(n, corpus)
  words = corpus.strip.split(' ')

  # One candidate window per starting position; positions past the end of the
  # word list show up as nil inside the window.
  windows = words.each_index.map do |start|
    (0...n).map { |offset| words[start + offset] }
  end

  # Windows that ran off the end of the corpus are incomplete and are dropped.
  windows.reject { |window| window.include?(nil) }.uniq
end
# Builds an n-gram Markov transition table from a text corpus and generates
# random text by walking that table.
#
# The corpus is either passed in directly, or read from a file under ./brown/
# whose tokens are written as "word/tag".
class TextAnalyser
  # Tags that mark a token as punctuation to be glued onto the preceding word.
  PUNCTUATION_TAGS = ['.', ',', ':'].freeze

  # @return [Hash{Array<String> => Array<String>}] n-gram => words seen after it
  attr_reader :transition_map

  # @param n [Integer] number of words per n-gram (one Markov state)
  # @param corpus [String, nil] whitespace-separated text; used as-is when given
  # @param file_name [String, nil] file under ./brown/ parsed when corpus is nil
  # @raise [ArgumentError] if neither corpus nor file_name is supplied
  def initialize(n:, corpus: nil, file_name: nil)
    # Note how optional keyword parameters are written `name: default`,
    # using a colon instead of an equals sign.
    raise ArgumentError, 'either corpus: or file_name: is required' if corpus.nil? && file_name.nil?

    @n = n
    @file_name = file_name
    @corpus = corpus || parse_corpus
    @transition_map = generate_transition_map
  end

  # @return [Array<Array<String>>] every run of @n consecutive corpus words,
  #   in order, duplicates included
  def ngrams
    @corpus.strip.split(' ').each_cons(@n).to_a
  end

  # Records `val` as one more possible successor of `ngram`.
  #
  # @param ngram [Array<String>] the state to update
  # @param val [String] the word observed after that state
  # @return [Hash] the (mutated) transition map
  def update_table(ngram, val)
    # The map's default block creates the successor list on first access.
    @transition_map[ngram] << val
    @transition_map
  end

  # Picks a random successor word for `state`.
  #
  # @param state [Array<String>] an n-gram
  # @return [String, nil] a sampled successor, or nil when the state is unknown
  def get_next_state(state)
    # fetch avoids the hash's default block, so looking up an unseen state
    # does not insert an empty entry into the transition map.
    @transition_map.fetch(state, []).sample
  end

  # Generates up to `iterations` words by repeatedly sampling a successor of
  # the current n-gram. Stops early (and prints a notice) when the current
  # n-gram has no known successor.
  #
  # @param iterations [Integer] maximum number of words to emit
  # @param start_state [Array<String>] n-gram to start from
  # @param end_states [Array] accepted for interface compatibility; currently unused
  # @return [String] the generated words joined by single spaces
  def generate_text(iterations, start_state: [], end_states: [])
    words = []
    puts "Starting from: #{start_state}"
    current = start_state
    iterations.times do
      next_state = get_next_state(current)
      if next_state.nil?
        puts "Got nil from #{current}"
        break
      end
      # The sampled word goes to the output...
      words << next_state
      # ...and the state slides forward by one word.
      current = make_ngram(current, next_state)
    end
    words.join(' ')
  end

  private

  # Strips the "/tag" suffix from every token of a corpus line, after gluing
  # punctuation tokens onto the word before them.
  #
  # @param line [String] one line of "word/tag" tokens
  # @return [String] the untagged words joined by single spaces
  def remove_tags(line)
    tokens = fix_punctuation(line.strip.split(' '))
    tokens.map { |token| token.split('/').first }.join(' ')
  end

  # Merges each punctuation token into the token that precedes it, so that
  # ["dog/nn", ",/,"] becomes ["dog,/nn"]. Builds a new array instead of
  # deleting from the one being iterated, which would skip the token after
  # every merge.
  #
  # @param line_arr [Array<String>] "word/tag" tokens of one line
  # @return [Array<String>] new token list with punctuation attached
  def fix_punctuation(line_arr)
    line_arr.each_with_object([]) do |token, merged|
      parts = token.split('/')
      # Punctuation at the very start of a line has nothing to attach to and
      # is kept as its own token.
      if PUNCTUATION_TAGS.include?(parts.last) && !merged.empty?
        previous = merged.last.split('/')
        merged[-1] = "#{previous.first}#{parts.first}/#{previous.last}"
      else
        merged << token
      end
    end
  end

  # Reads ./brown/<file_name>, untags every line and joins the non-blank
  # results into one corpus string.
  #
  # @return [String] the whole file as space-separated words
  def parse_corpus
    puts "parsing corpus for file: #{@file_name}"
    File.foreach("./brown/#{@file_name}")
        .map { |line| remove_tags(line) }
        .reject(&:empty?) # blank lines untag to ""
        .join(' ')        # a separator keeps words on adjacent lines apart
  end

  # Maps each n-gram to the list of words that followed it in the corpus.
  # The final n-gram has no successor and therefore gets no entry.
  #
  # @return [Hash{Array<String> => Array<String>}] hash whose default block
  #   stores a fresh array for any key accessed with []
  def generate_transition_map
    table = Hash.new { |hash, key| hash[key] = [] }
    # ngrams is computed once; consecutive pairs give (state, next state).
    ngrams.each_cons(2) { |gram, following| table[gram] << following.last }
    table
  end

  # Slides the state window forward: the last (@n - 1) words of `arr`
  # followed by `item`.
  #
  # @param arr [Array<String>] current n-gram
  # @param item [String] word just emitted
  # @return [Array<String>] the next n-gram
  def make_ngram(arr, item)
    arr.last(@n - 1) + [item]
  end
end
# Placeholder corpus for experimenting without the tagged corpus file; it is
# only used by the commented-out constructor call below.
text = <<TEXT.chomp
Far far away, behind the word mountains, far from the countries
Vokalia and Consonantia, there live the blind texts. Separated they live in
Bookmarksgrove right at the coast of the Semantics, a large language ocean. A
small river named Duden flows by their place and supplies it with the
necessary regelialia. It is a paradisematic country, in which roasted parts of
sentences fly into your mouth. Even the all-powerful Pointing has no control
about the blind texts it is an almost unorthographic life One day however a
small line of blind text by the name of Lorem Ipsum decided to leave for the
far World of Grammar. The Big Oxmox advised her not to do so, because there
were thousands of bad Commas, wild Question Marks and devious Semikoli, but
the Little Blind Text didn’t listen. She packed her seven versalia, put her
initial into the belt and made herself on the way. When she reached the first
hills of the Italic Mountains, she had a last view back on the skyline of her
hometown Bookmarksgrove, the headline of Alphabet Village and the subline of
her own road, the Line Lane. Pityful a rethoric question ran over her cheek,
then
TEXT
# text = "the more we try the more we do"
# analyser = TextAnalyser.new(n: 2, corpus: text)

# Build a bigram model from the corpus file ./brown/cb01.
analyser = TextAnalyser.new(n: 2, file_name: 'cb01')
# puts analyser.transition_map
# print analyser.ngrams

# Start from a randomly chosen n-gram and print up to 20 generated words.
seed = analyser.ngrams.sample
puts analyser.generate_text(20, start_state: seed)
# TODO: Can we break up the text line-by-line?
# TODO: Better indication of Start and End tokens?
# Some creative uses of Markov chain text generation:
# - Sonnets
# - Scientific Papers
# - Poems
# - Tweets
# - Blog posts
# - Political Speeches
# - Markov Haiku?
# - And so on...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment