Created
November 18, 2015 20:42
-
-
Save gfldex/e8469ad25ee559e7c8ca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl6 | |
# a simple script to take a dictionary w/ one word per line, e.g. | |
# /usr/share/dict/words, and count trigrams of letters in order to | |
# build a model to generate plausible nonsense words | |
# pretty much all the slowness happens in this function | |
# admittedly it gets called thousands of times | |
use nqp; | |
sub update_dict(\dict, Str $w) { | |
my $tagged := '$$' ~ $w ~ '%'; | |
my str $tagged_native = nqp::unbox_s($tagged); | |
my int $max = nqp::chars($tagged_native); | |
loop (my int $idx = 0; $idx < $max - 2; $idx = $idx + 1) { | |
dict{nqp::substr($tagged_native,$idx,3)}++; | |
} | |
} | |
my Int %countdict; | |
for lines() -> $w { | |
update_dict(%countdict, $w); | |
} | |
say "processed"; | |
my @c = unique(gather { | |
for %countdict.keys -> $k { | |
take $_ for $k.split(''); | |
} | |
}); | |
sub getnext(%dict, @c, $cur) { | |
my $firsttwo = $cur.substr(1,2); | |
my $m = 0; | |
my $mv = '%'; | |
for @c -> $v { | |
my $score = rand * (%dict{$firsttwo ~ $v} // 0); | |
if $score > $m { | |
$m = $score; | |
$mv = $v; | |
} | |
} | |
return $mv; | |
} | |
sub getword(%dict, @c) { | |
my $str = '$$' ~ @c.pick; | |
until $str.substr($str.chars-1, 1) === '%' { | |
my $cur = $str.flip.substr(0,3).flip; | |
$str = $str ~ getnext(%countdict, @c, $cur); | |
} | |
return $str.match(/\w+/); | |
} | |
say getword(%countdict, @c) for ^7; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment