Last active
October 11, 2015 17:11
-
-
Save samanthadoran/013c70cd429d3769d23b to your computer and use it in GitHub Desktop.
Arbitrary-order Markov text generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tables
import sequtils, strutils, math, random
proc readCorpusString(filename: string): seq[string] =
  ## Reads the corpus file `filename` and returns its words in order.
  ##
  ## Words are the whitespace-separated tokens of each line, so runs of
  ## spaces or tabs never produce empty "words". If the file cannot be
  ## opened, an empty sequence is returned.
  result = @[]
  var file: File
  if not open(file, filename):
    return
  # Close the handle even if reading raises part-way through.
  defer: file.close()
  # Read through the handle we just opened instead of re-opening by name.
  for line in file.lines:
    result.add(line.splitWhitespace())
proc makeTable(corpus: seq[string], order: int = 2): Table[string, seq[string]] =
  ## Builds the Markov transition table for `corpus`.
  ##
  ## Each key is `order` consecutive corpus words, lowercased and joined
  ## with single spaces. Its value lists every word that follows that key in
  ## the corpus, in corpus order and with repeats kept, so more frequent
  ## successors are proportionally more likely to be sampled.
  ##
  ## A negative `order`, or a corpus with no more than `order` words, yields
  ## an empty table.
  result = initTable[string, seq[string]]()
  if order < 0:
    return
  # `j` is the index of the successor word; the key is the `order` words
  # immediately before it. Running up to the last index keeps the final
  # transitions of the corpus in the table.
  for j in countup(order, len(corpus) - 1):
    var keyWords = newSeq[string]()
    for k in countup(j - order, j - 1):
      keyWords.add(corpus[k].toLowerAscii())
    let key = join(keyWords, " ")
    # Append in place; creates the entry on first sight of the key.
    result.mgetOrPut(key, newSeq[string]()).add(corpus[j])
proc generate(corpusSeq: seq[string], corpusTable: Table[string, seq[string]],
              maxWords: int, seedIndex: int = -1, order: int = 2): string =
  ## Generates up to `maxWords` words of Markov text, joined by spaces.
  ##
  ## `corpusSeq` is the word list and `corpusTable` maps a lowercased,
  ## space-joined run of `order` words to the words that may follow it.
  ## `seedIndex` is the corpus index of the first seed word. If it is
  ## negative, or too close to the end to supply `order` seed words, a
  ## random start position is used instead.
  ##
  ## Generation stops early when the current key has no successors in the
  ## table. Returns "" when `maxWords < 1`, `order < 1`, or the corpus has
  ## fewer than `order` words.
  result = ""
  # Highest index from which `order` consecutive words can still be read.
  let lastStart = len(corpusSeq) - order
  if order < 1 or lastStart < 0 or maxWords < 1:
    return

  # Make sure the random number generator is seeded.
  randomize()

  let seed =
    if seedIndex >= 0 and seedIndex <= lastStart: seedIndex
    # Prefer a start that is followed by at least one more corpus word.
    else: rand(max(0, lastStart - 1))

  # Sliding window of the `order` most recent words.
  var seedWords = corpusSeq[seed ..< seed + order]
  var generatedWords = newSeq[string]()
  for _ in 0 ..< maxWords:
    generatedWords.add(seedWords[0])
    # Keys are just joined lowercase strings.
    let key = seedWords.join(" ").toLowerAscii()
    # We cannot continue without a successor for this key.
    let successors = corpusTable.getOrDefault(key)
    if successors.len == 0:
      break
    # Drop the oldest word and append the newly chosen one.
    seedWords.delete(0)
    seedWords.add(sample(successors))
  result = join(generatedWords, " ")
proc main() =
  ## Interactive driver: asks for a corpus file and a chain order, builds
  ## the transition table once, then repeatedly prompts for a word count and
  ## an optional seed word and prints the generated text.
  ##
  ## The prompt loop ends when standard input reaches end of file.
  echo("Enter the name of the corpus you would like to parse...")
  let corpusSeq = readCorpusString(readLine(stdin).strip())
  echo("Enter the order of the markov chain you would like(2 is most common)...")
  # Strip first so trailing whitespace or a stray '\r' does not break parsing.
  let order = readLine(stdin).strip().parseInt()
  let corpusTable = makeTable(corpusSeq, order)

  var line = ""
  while true:
    echo("\n\n\n")
    echo("How many words would you like to generate?")
    if not stdin.readLine(line):
      break
    var maxWords: int
    try:
      maxWords = line.strip().parseInt()
    except ValueError:
      echo("Please enter a whole number.")
      continue

    echo("Enter the word you would like to seed(Leave blank for random)")
    if not stdin.readLine(line):
      break
    let answer = line.strip().toLowerAscii()

    # -1 asks generate() to pick a random starting position.
    var seedIndex = -1
    if answer != "":
      # Compare case-insensitively: the corpus keeps its original casing
      # while the answer has been lowercased. Only positions that leave room
      # for `order` seed words are considered.
      let lastStart = min(corpusSeq.high, corpusSeq.len - order)
      for idx in 0 .. lastStart:
        if corpusSeq[idx].toLowerAscii() == answer:
          seedIndex = idx
          break
    echo("\n")
    echo(generate(corpusSeq, corpusTable, maxWords, seedIndex, order))

when isMainModule:
  main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment