Skip to content

Instantly share code, notes, and snippets.

@baragona
Created December 16, 2014 05:21
Show Gist options
  • Save baragona/9c619f0f8141b677fd6a to your computer and use it in GitHub Desktop.
Save baragona/9c619f0f8141b677fd6a to your computer and use it in GitHub Desktop.
Markov Chain Encoding
#!/usr/bin/perl -w
# created October 4, 2014
use strict;
use Data::Dumper;
#use JSON::PP;
$"=" ";
$|=1;
sub context_to_string{
my @arr = @{ shift() };
my $char = ')';
return join $char, @arr;
}
sub dec2bin {
my $numBits = shift; # param1
my $number = shift; # param2
my $str = unpack("B32", pack("N", $number));
#$str =~ s/^0+(?=\d)//; # otherwise you'll get leading zeros
return substr($str, 32-$numBits, $numBits);
}
sub splitIntoWords{
my $text = shift;
my @words = split /\s+/, $text ;
@words = map { /(\w+|\W+)/g } @words;
return @words;
}
sub build_markov_table{
my $text = shift;
my $n_context=shift;
$text =~ s/[(){}\[\]"']+/ /g;
my @words = splitIntoWords($text);
my %words_seen;
#die Dumper \@words;
my @last_words;
push @last_words, "";
for (1 .. $n_context){
push @last_words, "";
}
for my $word(@words){
shift @last_words;
push @last_words, $word;
my @context = @last_words;
pop @context;
$words_seen{context_to_string \@context}{ $word }++;
}
return \%words_seen;
}
sub bitstring_to_bytes{
my $bitstring=shift;
$bitstring .= '0' x (8-(length($bitstring)%8)) if length($bitstring)%8>0;
return pack("B*", $bitstring);
}
sub bytes_to_bitstring{
my $bytes=shift;
$bytes = unpack("B*",$bytes);
return $bytes;
}
sub slurp_file{
my $fname = shift;
local( $/ ) ;
open my $fh, '<', $fname or die "cant open $!";
my $text = <$fh>;
return $text;
}
sub encode_or_decode{
my $mode = shift;
my %words_seen = %{ shift() };
my $n_context=shift;
my $input=shift;
my $context = [ map { "" } (1 .. $n_context)];
my @encodedWords;
my $input_plain;
if($mode eq "decode") {
@encodedWords = splitIntoWords($input);
}
if($mode eq "encode") {
$input_plain =$input;
}
my $input_binary = bytes_to_bitstring($input_plain);
my $getBit = sub {
if(not defined $input_binary){
return;
}
my $bit = substr($input_binary,0,1);
#warn $input_binary;
if(length($input_binary)==1){
$input_binary = undef;
}else{
$input_binary = substr($input_binary, 1);
}
return $bit;
#return 1;#rand() > .5? 1:0;
};
my $input_finished = sub {
if((not defined $input_binary) or $input_binary eq '' ){
return 1;
}
return 0;
};
my $bitString;
my $textString;
while(1) {
my %choices_hash = %{ $words_seen{context_to_string($context)} };
#print Dumper \%choices_hash;
my @keys = sort { ($choices_hash{$b} <=> $choices_hash{$a}) or ($a cmp $b) } (keys %choices_hash);
my $max_bits = int(log(scalar(@keys))/log(2));
if($mode eq "decode") {
my $word = shift(@encodedWords);
if(not defined $word) {
#print "\n".$bitString."\n";
return bitstring_to_bytes($bitString);
}
if($max_bits > 0){
my $numericValue;
for(my $i = 0; $i < scalar(@keys); ++$i){
if($word eq $keys[$i]){
$numericValue = $i;
}
}
die "numeric value undef BAD" if(not defined $numericValue);
#print $numericValue;
#print "\n".dec2bin($max_bits, $numericValue)."\n";
$bitString .= dec2bin($max_bits, $numericValue);
}
shift @$context;
push @$context, $word;
}
if($mode eq "encode") {
my @databits;
# get databits
for(1..$max_bits){
my $bit = $getBit->();
push @databits, $bit if defined $bit;
}
my $numeric_value = oct("0b".join("", @databits));
#print $max_bits."\n";
my $freq_sum=0;
for my $freq(values %choices_hash){
$freq_sum+=$freq;
}
# my $rand = int(rand()*$freq_sum);
#
# my $lower_limit=0;
# my $choice_key;
# for my $key(@keys){
# $lower_limit += $choices_hash{$key};
# if($rand < $lower_limit){
# $choice_key=$key;
# last;
# }
# }
# if(not defined $choice_key){
# warn "no choice index";
# $choice_key = $keys[-1];
# }
my $choice_key = $keys[$numeric_value];
#my $choice = $keys[ int(rand()* scalar(@keys)) ];
my $choice = $choice_key;
#die $choice;
$textString .= " " unless $choice =~ /[.,?!:;]/;
$textString .= $choice;
shift @$context;
push @$context, $choice;
if($input_finished->()){
#warn "input finished";
return $textString;
}
}
}
}
sub encode{
my %words_seen = %{ shift() };
my $n_context=shift;
my $input=shift;
return encode_or_decode('encode', \%words_seen, $n_context, $input);
}
sub decode{
my %words_seen = %{ shift() };
my $n_context=shift;
my $input=shift;
return encode_or_decode('decode', \%words_seen, $n_context, $input);
}
#Main script follows
if("@ARGV" =~ /-(d|e)\s+-sample=(.+?)$/){
my $m=$1;
my $sample_f = $2;
my $mode;
my $sample_text = slurp_file($sample_f);
my $n_context=2;
my %words_seen = %{build_markov_table($sample_text, $n_context)};
#print Dumper \%words_seen;
#my $plain_input = "She sells sea shells by the sea shore.";
#read everything from stdin
local $/;
my $plain_input = <STDIN>;
if($m eq 'd'){
print decode(\%words_seen, 2, $plain_input);
}elsif($m eq 'e'){
print encode(\%words_seen, 2, $plain_input);
}
}else{
die "Usage: ./markov.pl [-d or -e] -sample=<FILE>\nReads from STDIN and writes to STDOUT\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment