Skip to content

Instantly share code, notes, and snippets.

@xquery
Last active August 29, 2015 14:02
Show Gist options
  • Save xquery/3b527eaf04a02dc7f693 to your computer and use it in GitHub Desktop.
Save xquery/3b527eaf04a02dc7f693 to your computer and use it in GitHub Desktop.
Simple Markov chain example using XQuery (requires MarkLogic)
xquery version "1.0-ml";
import module namespace functx = "http://www.functx.com" at
"/MarkLogic/functx/functx-1.0-nodoc-2007-01.xqy";
(: Builds a word corpus from a web page.
   Fetches $uri over HTTP, cleans the response with xdmp:tidy, joins the text
   of every p element (any namespace) with single spaces, splits the result on
   runs of non-word characters and lower-cases each token.
   $uri    : address of the page to fetch
   returns : sequence of lower-case, non-empty word strings in document order :)
declare function local:generate-corpus(
  $uri
){
  (: xdmp:http-get returns the response description followed by the body, and
     xdmp:tidy returns a status node followed by the cleaned document, hence
     the two [2] selections. :)
  let $context := xdmp:tidy(xdmp:http-get($uri)[2])[2]
  let $content := fn:string-join($context//*:p, ' ')
  for $w in tokenize($content, '\W+')
  (: tokenize emits a zero-length string when the text starts or ends with a
     separator; such tokens are not words and must not enter the chain :)
  where $w ne ''
  return lower-case($w)
};
(: Generates at most $num-words words by walking the Markov chain.
   $first-word : the word element for the current state; an empty sequence
                 ends the walk
   $wordbase   : all word elements, as produced by local:generate-wordbase
   $num-words  : maximum number of words still to emit; zero or a negative
                 number yields the empty sequence
   returns     : the chosen @key attribute nodes, in generation order. The
                 walk stops early when the current word has no recorded
                 follower. :)
declare function local:generate-text(
  $first-word,
  $wordbase,
  $num-words
){
  (: "le 0" rather than "eq 0" so a negative count cannot recurse forever :)
  if ($num-words le 0 or empty($first-word)) then ()
  else
    let $new-word := local:choose-word($first-word)
    return
      (: dead end: nothing ever followed this word in the corpus :)
      if (empty($new-word)) then ()
      else (
        $new-word,
        local:generate-text($wordbase[@value eq $new-word], $wordbase, $num-words - 1)
      )
};
(: Creates the Markov chain word database.
   For every distinct (lower-cased) word of $corpus, emits one
     <word value="WORD"> serialized map:map </word>
   element whose map has one entry per word that directly follows WORD in the
   corpus; the entry value is the number of times that follower occurs.
   $corpus : sequence of word strings in reading order
   returns : one word element per distinct word. A word that only occurs as
             the very last corpus item gets a map with no entries. :)
declare function local:generate-wordbase(
  $corpus
){
  (: Normalise once so that the distinct words, the position lookup and the
     follower keys all use the same casing. :)
  let $words := for $w in $corpus return lower-case($w)
  for $word in distinct-values($words)
  let $positions := index-of($words, $word)
  (: the position after the last corpus item selects nothing and simply
     drops out of the sequence :)
  let $followers := for $p in $positions return $words[$p + 1]
  let $map := map:map()
  return
    <word value="{$word}">
    {
      (: map:put returns nothing; it only fills $map before $map is emitted :)
      (for $w in distinct-values($followers)
       return map:put($map, $w, count(index-of($followers, $w)))),
      $map
    }</word>
};
(: Probabilistic selection of the next word, based on the Markov chain.
   $word   : a word element whose descendant entry elements carry a @key (the
             follower) and a value element (how often it follows)
   returns : the @key attribute of the selected entry, chosen with probability
             proportional to its value; the empty sequence when $word has no
             entries. :)
declare function local:choose-word(
  $word as element(word)
){
  let $values := $word//*:value/number(.)
  let $sum := xs:integer(sum($values))
  return
    if ($sum le 0) then ()
    else
      (: Draw uniformly from 1..$sum. xdmp:random($max) is inclusive of both 0
         and $max, and a $max of 0 means "no upper bound", so a sum of 1 is
         handled without calling it. :)
      let $r :=
        if ($sum eq 1) then 1
        else xdmp:random(xs:unsignedLong($sum - 1)) + 1
      return
        (
          (: the first entry whose running total reaches $r is the pick :)
          for $v at $n in $values
          where $r le sum($values[1 to $n])
          return $word//*:entry[$n]/@key
        )[1]
};
(: Main query: builds the corpus from a URI, derives the Markov chain for every
   word in it, picks a random starting word and returns that word followed by
   up to 10 generated words, joined with single spaces. :)
let $corpus := local:generate-corpus("http://en.wikipedia.org/wiki/Dixie_Square_Mall")
let $wordbase := local:generate-wordbase($corpus)
let $count := count($wordbase)
(: Pick a position in 1..$count. xdmp:random($max) can return 0, which selects
   nothing, and a $max of 0 means "no upper bound", so counts of 0 and 1 are
   handled without calling it. :)
let $r := if ($count le 1) then $count else xdmp:random($count - 1) + 1
let $first-word := $wordbase[$r]
return
  string-join(
    (
      $first-word/@value,
      local:generate-text($first-word, $wordbase, 10)
    ),
    " ")
@xquery
Copy link
Author

xquery commented Jun 13, 2014

example output - major incidents occurred at the remainder of the blues brothers film

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment