Last active
May 13, 2016 23:16
-
-
Save zehnpaard/442bce736664c09485de73b26525c371 to your computer and use it in GitHub Desktop.
Random Text Generator Using Markov Chains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;Utility Functions | |
(defn map-vals | |
"Transform a map by applying f to each value" | |
[f hmap] | |
(zipmap (keys hmap) | |
(map f (vals hmap)))) | |
(defn reductions-vals | |
"Transform a map by running (reductions f ...) onto the values | |
and returning an array-map" | |
[f hmap] | |
(apply array-map | |
(interleave | |
(keys hmap) | |
(reductions f (vals hmap))))) | |
;Creating N-Gram & Word Frequency Distributions | |
(defn file->words | |
"Create a sequence of words from a string specifying the filename | |
containing the source text" | |
[file] | |
(-> file | |
slurp | |
(clojure.string/split #" "))) | |
(defn make-ngrams | |
"Transform a sequence of words to a sequence of vectors of all | |
n-length consecutive words, looping at the end to the start" | |
[words n] | |
(letfn [(rotate [[head & tail]] | |
(concat tail [head]))] | |
(->> words | |
(iterate rotate) | |
(take n) | |
(apply map vector)))) | |
(defn cumulative-frequencies | |
"Take a list and convert to an array-map of cumulative frequencies" | |
[xs] | |
(->> xs | |
frequencies | |
(reductions-vals +))) | |
(defn words->freq-distribution | |
"Create an associative of all n-1 length ngram vectors to the | |
cumulative frequency distributions of the following word" | |
[words n] | |
(let [ngrams (make-ngrams words n) | |
n-1gram (comp vec drop-last) | |
grouped (group-by n-1gram ngrams)] | |
(->> grouped | |
(map-vals #(map last %)) | |
(map-vals cumulative-frequencies)))) | |
;Generating Infinite Markov Chains | |
(defn next-word | |
"Choose the next word based on an associative with vectors of n-grams | |
and the cumulative frequency distribution of the words following" | |
[starting-words freq-distribution] | |
(let [cum-freq (get freq-distribution starting-words) | |
total (second (last cum-freq)) | |
i (rand-int total) | |
pair-at-i (first | |
(filter #(< i (second %)) cum-freq)) | |
word-at-i (first pair-at-i)] | |
word-at-i)) | |
(defn markov-chain | |
"Create an infinite sequence of words by repeatedly choosing the next | |
word based on the previous n-1 words" | |
[starting-words freq-distribution] | |
(letfn [(f [words] | |
(conj (vec (rest words)) | |
(next-word words freq-distribution)))] | |
(->> starting-words | |
(iterate f) | |
(map first)))) | |
;Putting it all together | |
(defn combine-words | |
"Take a sequence of words and convert to a string with spaces between words" | |
[words] | |
(->> words | |
(interpose " ") | |
(apply str))) | |
(defn random-text | |
"Generate random text of 'length', starting with 'words', using | |
word distributions based on the text in 'file'" | |
[words length file] | |
(let [n (inc (count words)) | |
freq-distribution (-> file | |
file->words | |
(words->freq-distribution n))] | |
(if (contains? freq-distribution words) | |
(->> (markov-chain words freq-distribution) | |
(take length) | |
combine-words)))) | |
(comment | |
"Usage example:" | |
(random-text ["it" "is" "a"] 100 "war-and-peace.txt")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment