Created
November 19, 2014 12:34
-
-
Save mgill25/effc45adeb392ffe74cd to your computer and use it in GitHub Desktop.
Markov generator finished
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text Analysis for the Markov chain
# class Object
#   def method_missing( name, *args )
#     puts "There is no method '##{name}' defined on #{self.class}, you dummy!"
#   end
# end
# Builds the distinct n-grams (runs of n consecutive words) of a text.
#
# n      - Integer number of words per n-gram.
# corpus - String to tokenize; words are separated by whitespace.
#
# Returns an Array of word Arrays, unique, in order of first appearance.
# The result is empty when n is below 1 or the corpus has fewer than n words.
def get_ngrams(n, corpus)
  # A zero-word n-gram is meaningless, and each_cons rejects non-positive sizes.
  return [] if n < 1

  corpus.strip.split(' ').each_cons(n).to_a.uniq
end
# Builds an order-n Markov model from whitespace-separated text and generates
# new text by walking the resulting transition table.
class TextAnalyser
  # Tags that mark a token as punctuation to be glued onto the preceding word.
  PUNCTUATION_TAGS = %w[. , :].freeze
  private_constant :PUNCTUATION_TAGS

  # Hash mapping each n-gram (Array of words) to the Array of words that were
  # seen immediately after it in the corpus.
  attr_reader :transition_map

  # n         - number of words per state (the n-gram size).
  # corpus    - text to analyse; when nil the text is loaded from file_name.
  # file_name - name of a tagged corpus file inside ./brown/.
  #
  # Raises ArgumentError when neither corpus nor file_name is given.
  def initialize(n:, corpus: nil, file_name: nil)
    # Optional keyword arguments are written `name: default` (a colon, not an
    # equals sign); `n:` has no default and is therefore required.
    if corpus.nil? && file_name.nil?
      raise ArgumentError, 'provide either corpus: or file_name:'
    end

    @n = n
    @file_name = file_name
    @corpus = corpus || parse_corpus
    @transition_map = generate_transition_map
  end

  # Returns every run of @n consecutive corpus words, in order, duplicates
  # included.
  def ngrams
    @corpus.strip.split(' ').each_cons(@n).to_a
  end

  # Records val as a possible successor of ngram.
  #
  # Returns the (mutated) transition map.
  def update_table(ngram, val)
    # The map's default block creates the successor list on first access.
    @transition_map[ngram] << val
    @transition_map
  end

  # Picks a random successor for state, or nil when the state is unknown.
  def get_next_state(state)
    # fetch bypasses the hash's default block, so a lookup never inserts an
    # empty entry for a state that was not in the corpus.
    @transition_map.fetch(state, []).sample
  end

  # Generates up to `iterations` words by repeatedly sampling a successor of
  # the current state and sliding the state window forward by one word.
  #
  # iterations  - maximum number of words to emit.
  # start_state - n-gram (Array of words) to start from; its own words are not
  #               part of the output.
  # end_states  - accepted but not consulted by this method.
  #
  # Returns the generated words joined by single spaces. Generation stops
  # early (with a message on stdout) when the current state has no successor.
  def generate_text(iterations, start_state: [], end_states: [])
    puts "Starting from: #{start_state}"
    current = start_state
    words = []
    iterations.times do
      next_state = get_next_state(current)
      if next_state.nil?
        puts "Got nil from #{current}"
        break
      end
      # The sampled word goes to the output...
      words << next_state
      # ...and the next state is formed from the words seen so far.
      current = make_ngram(current, next_state)
    end
    words.join(' ')
  end

  # Splits a "word/tag" token on its LAST slash so that words which themselves
  # contain a slash keep their full text.
  #
  # Returns [word, tag]; tag is nil when the token has no slash.
  def split_token(token)
    word, slash, tag = token.rpartition('/')
    slash.empty? ? [token, nil] : [word, tag]
  end

  # Strips the "/tag" suffix from every token of a corpus line and attaches
  # punctuation to the word before it.
  #
  # Returns the cleaned line as a single space-separated String.
  def remove_tags(line)
    fix_punctuation(line.strip.split(' '))
      .map { |token| split_token(token).first }
      .join(' ')
  end

  # Merges each punctuation token into the token that precedes it, e.g.
  # ["barked/vbd", "./."] becomes ["barked./vbd"].
  #
  # Builds a new Array instead of deleting from line_arr while iterating it,
  # which would skip the element following every merge. A punctuation token
  # with nothing before it is kept as-is.
  def fix_punctuation(line_arr)
    line_arr.each_with_object([]) do |token, merged|
      mark, tag = split_token(token)
      if PUNCTUATION_TAGS.include?(tag) && !merged.empty?
        prev_word, prev_tag = split_token(merged.last)
        merged[-1] = prev_tag ? "#{prev_word}#{mark}/#{prev_tag}" : "#{prev_word}#{mark}"
      else
        merged << token
      end
    end
  end

  # Loads ./brown/<file_name>, removes the tags from every line and joins the
  # non-blank lines with a space so words never fuse across line boundaries.
  def parse_corpus
    puts "parsing corpus for file: #{@file_name}"
    File.open("./brown/#{@file_name}", "r") do |fsock|
      fsock.each_line.map { |line| remove_tags(line) }.reject(&:empty?).join(' ')
    end
  end

  # Returns a Hash that maps each n-gram to all the words observed right after
  # it. The Hash creates a fresh Array for a key on first access, so callers
  # can append to an entry without initialising it.
  def generate_transition_map
    table = Hash.new { |hsh, key| hsh[key] = [] }
    # Compute the n-gram list once; the successor of an n-gram is the last
    # word of the n-gram that follows it.
    ngrams.each_cons(2) { |current, following| table[current] << following.last }
    table
  end

  # Slides the state window: keeps the last (@n - 1) words of arr and appends
  # item.
  def make_ngram(arr, item)
    arr.last(@n - 1) << item
  end

  # Declare what methods should be private at the end.
  private :split_token, :remove_tags, :fix_punctuation, :parse_corpus,
          :generate_transition_map, :make_ngram
end
# Sample corpus for trying the analyser without a corpus file on disk (see the
# commented-out constructor call below).
text = %Q(Far far away, behind the word mountains, far from the countries
Vokalia and Consonantia, there live the blind texts. Separated they live in
Bookmarksgrove right at the coast of the Semantics, a large language ocean. A
small river named Duden flows by their place and supplies it with the
necessary regelialia. It is a paradisematic country, in which roasted parts of
sentences fly into your mouth. Even the all-powerful Pointing has no control
about the blind texts it is an almost unorthographic life One day however a
small line of blind text by the name of Lorem Ipsum decided to leave for the
far World of Grammar. The Big Oxmox advised her not to do so, because there
were thousands of bad Commas, wild Question Marks and devious Semikoli, but
the Little Blind Text didn’t listen. She packed her seven versalia, put her
initial into the belt and made herself on the way. When she reached the first
hills of the Italic Mountains, she had a last view back on the skyline of her
hometown Bookmarksgrove, the headline of Alphabet Village and the subline of
her own road, the Line Lane. Pityful a rethoric question ran over her cheek,
then)
# text = "the more we try the more we do"
# ta = TextAnalyser.new(n:2, corpus:text)

# Build a bigram model from the corpus file named "cb01".
ta = TextAnalyser.new(n: 2, file_name: "cb01")
# puts ta.transition_map
# print ta.ngrams

# Emit up to 20 words, seeded with a randomly chosen n-gram from the corpus.
puts ta.generate_text(20, start_state: ta.ngrams.sample)
# TODO: Can we break up the text line-by-line?
# TODO: Better indication of Start and End tokens?
# Some creative uses of Markov chain text generation:
# - Sonnets
# - Scientific Papers
# - Poems
# - Tweets
# - Blog posts
# - Political Speeches
# - Markov Haiku?
# - And so on...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment