Last active
August 29, 2015 14:21
-
-
Save rjbs/9e716296aeef57e65068 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
class ZMachine::ZSCII { | |
use ZMachine::Util; | |
my %DEFAULT_ZSCII = ( | |
chr(0x00) => "\c[NULL]", | |
chr(0x08) => "\c[DELETE]", | |
chr(0x0D) => "\x0D", | |
chr(0x1B) => "\c[ESCAPE]", | |
# I consider the fact that { .chr } is a block but { .chr => .chr } is a | |
# hash to be one of the more frustrating things in Perl 6. This was | |
# already a horrible problem in Perl 5, and it isn't fixed. It's now | |
# unambiguous to a computer, but for a person, it's just not clear enough. | |
# -- rjbs, 2015-05-12 | |
(0x20 .. 0x7E).map(-> $_ { .chr => .chr }), # ASCII maps over | |
# 0x09B - 0x0FB are the "extra characters"; need Unicode translation table | |
# 0x0FF - 0x3FF are undefined and never (?) used | |
); | |
# We can use these characters below because they all (save for the magic | |
# A2-C6) are the same in Unicode/ASCII/ZSCII. -- rjbs, 2013-01-18 | |
my $DEFAULT_ALPHABET = join('', | |
'a' .. 'z', # A0 | |
'A' .. 'Z', # A1 | |
( # A2 | |
"\0", # special: read 2 chars for 10-bit zscii character | |
"\x0D", | |
(0 .. 9), | |
<< . , ! ? _ # ' " / \ - : ( ) >>, | |
), | |
); | |
my @DEFAULT_EXTRA = << | |
E4 F6 FC C4 D6 DC DF BB AB EB EF FF CB CF E1 E9 | |
ED F3 FA FD C1 C9 CD D3 DA DD E0 E8 EC F2 F9 C0 | |
C8 CC D2 D9 | |
E2 EA EE F4 FB C2 CA CE D4 DB E5 C5 F8 D8 E3 F1 | |
F5 C3 D1 D5 E6 C6 E7 C7 FE F0 DE D0 A3 153 152 A1 | |
BF | |
>>.map({ "0x{$_}".chr }); | |
# sub _validate_alphabet { | |
# my (undef, $alphabet) = @_; | |
# | |
# Carp::croak("alphabet table was not 78 entries long") | |
# unless length $alphabet == 78; | |
# | |
# Carp::carp("alphabet character 52 not set to 0x000") | |
# unless substr($alphabet, 52, 1) eq chr(0); | |
# | |
# Carp::croak("alphabet table contains characters over 0xFF") | |
# if grep {; ord > 0xFF } split //, $alphabet; | |
# } | |
# | |
# sub _shortcuts_for { | |
# my ($self, $alphabet) = @_; | |
# | |
# $self->_validate_alphabet($alphabet); | |
# | |
# my %shortcut = (q{ } => chr(0)); | |
# | |
# for my $i (0 .. 2) { | |
# my $offset = $i * 26; | |
# my $prefix = $i ? chr(0x03 + $i) : ''; | |
# | |
# for my $j (0 .. 25) { | |
# next if $i == 2 and $j == 0; # that guy is magic! -- rjbs, 2013-01-18 | |
# | |
# $shortcut{ substr($alphabet, $offset + $j, 1) } = $prefix . chr($j + 6); | |
# } | |
# } | |
# | |
# return \%shortcut; | |
# } | |
# | |
# sub new { | |
# my ($class, $arg) = @_; | |
# | |
# if (! defined $arg) { | |
# $arg = { version => 5 }; | |
# } if (! ref $arg) { | |
# $arg = { version => $arg }; | |
# } | |
# | |
# my $guts = { version => $arg->{version} }; | |
# | |
# Carp::croak("only Version 5, 7, and 8 ZSCII are supported at present") | |
# unless $guts->{version} == 5 | |
# or $guts->{version} == 7 | |
# or $guts->{version} == 8; | |
# | |
# $guts->{zscii} = { %DEFAULT_ZSCII }; | |
# | |
# # Why is this an arrayref and not, like alphabets, a string? | |
# # Alphabets are strings because they're guaranteed to fit in bytestrings. | |
# # You can't put a ZSCII character over 0xFF in the alphabet, because it can't | |
# # be put in the story file's alphabet table! By using a string, it's easy to | |
# # just pass in the alphabet from memory to/from the codec. On the other | |
# # hand, the Unicode translation table stores Unicode codepoint values packed | |
# # into words, and it's not a good fit for use in the codec. Maybe a | |
# # ZMachine::Util will be useful for packing/unpacking Unicode translation | |
# # tables. | |
# $guts->{extra} = $arg->{extra_characters} | |
# || \@DEFAULT_EXTRA; | |
# | |
# Carp::confess("Unicode translation table exceeds maximum length of 97") | |
# if @{ $guts->{extra} } > 97; | |
# | |
# for (0 .. $#{ $guts->{extra} }) { | |
# Carp::confess("tried to add ambiguous Z->U mapping") | |
# if exists $guts->{zscii}{ chr(155 + $_) }; | |
# | |
# my $u_char = $guts->{extra}[$_]; | |
# | |
# # Extra characters must go into the Unicode substitution table, which can | |
# # only represent characters with codepoints between 0 and 0xFFFF. See | |
# # Z-Machine Spec v1.1 § 3.8.4.2.1 | |
# Carp::confess("tried to add Unicode codepoint greater than U+FFFF") | |
# if ord($u_char) > 0xFFFF; | |
# | |
# $guts->{zscii}{ chr(155 + $_) } = $u_char; | |
# } | |
# | |
# $guts->{zscii_for} = { }; | |
# for my $zscii_char (sort keys %{ $guts->{zscii} }) { | |
# my $unicode_char = $guts->{zscii}{$zscii_char}; | |
# | |
# Carp::confess("tried to add ambiguous U->Z mapping") | |
# if exists $guts->{zscii_for}{ $unicode_char }; | |
# | |
# $guts->{zscii_for}{ $unicode_char } = $zscii_char; | |
# } | |
# | |
# my $self = bless $guts => $class; | |
# | |
# # The default alphabet is entirely made up of characters that are the same in | |
# # Unicode and ZSCII. If a user wants to put "extra characters" into the | |
# # alphabet table, though, the alphabet should contain ZSCII values. When | |
# # we're building a ZMachine::ZSCII using the contents of the story file's | |
# # alphabet table, that's easy. If we're building a codec to *produce* a | |
# # story file, it's less trivial, because we don't want to think about the | |
# # specific ZSCII codepoints for the Unicode text we'll encode. | |
# # | |
# # We provide alphabet_is_unicode to let the user say "my alphabet is supplied | |
# # in Unicode, please convert it to ZSCII during construction." -- rjbs, | |
# # 2013-01-19 | |
# my $alphabet = $arg->{alphabet} || $DEFAULT_ALPHABET; | |
# | |
# # It's okay if the user supplies alphabet_is_unicode but not alphabet, | |
# # because the default alphabet is all characters with the same value in both | |
# # character sets! -- rjbs, 2013-01-20 | |
# $alphabet = $self->unicode_to_zscii($alphabet) | |
# if $arg->{alphabet_is_unicode}; | |
# | |
# $self->{alphabet} = $alphabet; | |
# $self->{shortcut} = $class->_shortcuts_for( $self->{alphabet} ); | |
# | |
# return $self; | |
# } | |
# | |
# =method encode | |
# | |
# my $packed_zchars = $z->encode( $unicode_text ); | |
# | |
# This method takes a string of text and encodes it to a bytestring of packed | |
# Z-characters. | |
# | |
# Internally, it converts the Unicode text to ZSCII, then to Z-characters, and | |
# then packs them. Before this processing, any native newline characters (the | |
# value of C<\n>) are converted to C<U+000D> to match the Z-Machine's use of | |
# character 0x00D for newline. | |
# | |
# =cut | |
# | |
# sub encode { | |
# my ($self, $string) = @_; | |
# | |
# $string =~ s/\n/\x0D/g; | |
# | |
# my $zscii = $self->unicode_to_zscii($string); | |
# my $zchars = $self->zscii_to_zchars($zscii); | |
# | |
# return $self->pack_zchars($zchars); | |
# } | |
# | |
# =method decode | |
# | |
# my $text = $z->decode( $packed_zchars ); | |
# | |
# This method takes a bytestring of packed Z-characters and returns a string of | |
# text. | |
# | |
# Internally, it unpacks the Z-characters, converts them to ZSCII, and then | |
# converts those to Unicode. Any ZSCII characters 0x00D are converted to the | |
# value of C<\n>. | |
# | |
# =cut | |
# | |
# sub decode { | |
# my ($self, $bytestring) = @_; | |
# | |
# my $zchars = $self->unpack_zchars( $bytestring ); | |
# my $zscii = $self->zchars_to_zscii( $zchars ); | |
# my $unicode = $self->zscii_to_unicode( $zscii ); | |
# | |
# $unicode =~ s/\x0D/\n/g; | |
# | |
# return $unicode; | |
# } | |
# | |
# =method unicode_to_zscii | |
# | |
# my $zscii_string = $z->unicode_to_zscii( $unicode_string ); | |
# | |
# This method converts a Unicode string to a ZSCII string, using the dialect of | |
# ZSCII for the ZMachine::ZSCII's configuration. | |
# | |
# If the Unicode input contains any characters that cannot be mapped to ZSCII, an | |
# exception is raised. | |
# | |
# =cut | |
# | |
# sub unicode_to_zscii { | |
# my ($self, $unicode_text) = @_; | |
# | |
# my $zscii = ''; | |
# for (0 .. length($unicode_text) - 1) { | |
# my $char = substr $unicode_text, $_, 1; | |
# | |
# Carp::croak( | |
# sprintf "no ZSCII character available for Unicode U+%v05X <%s>", | |
# $char, | |
# charnames::viacode(ord $char), | |
# ) unless defined( my $zscii_char = $self->{zscii_for}{ $char } ); | |
# | |
# $zscii .= $zscii_char; | |
# } | |
# | |
# return $zscii; | |
# } | |
# | |
# =method zscii_to_unicode | |
# | |
# my $unicode_string = $z->zscii_to_unicode( $zscii_string ); | |
# | |
# This method converts a ZSCII string to a Unicode string, using the dialect of | |
# ZSCII for the ZMachine::ZSCII's configuration. | |
# | |
# If the ZSCII input contains any characters that cannot be mapped to Unicode, an | |
# exception is raised. I<In the future, it may be possible to request a Unicode | |
# replacement character instead.> | |
# | |
# =cut | |
# | |
# sub zscii_to_unicode { | |
# my ($self, $zscii) = @_; | |
# | |
# my $unicode = ''; | |
# for (0 .. length($zscii) - 1) { | |
# my $char = substr $zscii, $_, 1; | |
# | |
# Carp::croak( | |
# sprintf "no Unicode character available for ZSCII %#v05x", $char, | |
# ) unless defined(my $unicode_char = $self->{zscii}{ $char }); | |
# | |
# $unicode .= $unicode_char; | |
# } | |
# | |
# return $unicode; | |
# } | |
# | |
# =method zscii_to_zchars | |
# | |
# my $zchars = $z->zscii_to_zchars( $zscii_string ); | |
# | |
# Given a string of ZSCII characters, this method will return a (unpacked) string | |
# of Z-characters. | |
# | |
# It will raise an exception on ZSCII codepoints that cannot be represented as | |
# Z-characters, which should not be possible with legal ZSCII. | |
# | |
# =cut | |
# | |
# sub zscii_to_zchars { | |
# my ($self, $zscii) = @_; | |
# | |
# return '' unless length $zscii; | |
# | |
# my $zchars = ''; | |
# for (0 .. length($zscii) - 1) { | |
# my $zscii_char = substr($zscii, $_, 1); | |
# if (defined (my $shortcut = $self->{shortcut}{ $zscii_char })) { | |
# $zchars .= $shortcut; | |
# next; | |
# } | |
# | |
# my $ord = ord $zscii_char; | |
# | |
# if ($ord >= 1024) { | |
# Carp::croak( | |
# sprintf "can't encode ZSCII codepoint %#v05x in Z-characters", | |
# $zscii_char | |
# ); | |
# } | |
# | |
# my $top = ($ord & 0b1111100000) >> 5; | |
# my $bot = ($ord & 0b0000011111); | |
# | |
# $zchars .= "\x05\x06"; # The escape code for a ten-bit ZSCII character. | |
# $zchars .= chr($top) . chr($bot); | |
# } | |
# | |
# return $zchars; | |
# } | |
# | |
# =method zchars_to_zscii | |
# | |
# my $zscii = $z->zchars_to_zscii( $zchars_string, \%arg ); | |
# | |
# Given a string of (unpacked) Z-characters, this method will return a string of | |
# ZSCII characters. | |
# | |
# It will raise an exception when the right thing to do can't be determined. | |
# Right now, that could mean lots of things. | |
# | |
# Valid arguments are: | |
# | |
# =begin :list | |
# | |
# = allow_early_termination | |
# | |
# If C<allow_early_termination> is true, no exception is thrown if the | |
# Z-character string ends in the middle of a four z-character sequence. This is | |
# useful when dealing with dictionary words. | |
# | |
# =end :list | |
# | |
# =cut | |
# | |
# sub zchars_to_zscii { | |
# my ($self, $zchars, $arg) = @_; | |
# $arg ||= {}; | |
# | |
# my $text = ''; | |
# my $alphabet = 0; | |
# | |
# while (length( my $char = substr $zchars, 0, 1, '')) { | |
# my $ord = ord $char; | |
# | |
# if ($ord == 0) { $text .= q{ }; next; } | |
# | |
# if ($ord == 0x04) { $alphabet = 1; next } | |
# elsif ($ord == 0x05) { $alphabet = 2; next } | |
# | |
# if ($alphabet == 2 && $ord == 0x06) { | |
# my $next_two = substr $zchars, 0, 2, ''; | |
# if (length $next_two != 2) { | |
# last if $arg->{allow_early_termination}; | |
# Carp::croak("ten-bit ZSCII encoding segment terminated early") | |
# } | |
# | |
# my $value = ord(substr $next_two, 0, 1) << 5 | |
# | ord(substr $next_two, 1, 1); | |
# | |
# $text .= chr $value; | |
# $alphabet = 0; | |
# next; | |
# } | |
# | |
# if ($ord >= 0x06 && $ord <= 0x1F) { | |
# $text .= substr $self->{alphabet}, (26 * $alphabet) + $ord - 6, 1; | |
# $alphabet = 0; | |
# next; | |
# } | |
# | |
# Carp::croak("unknown zchar <$char> encountered in alphabet <$alphabet>"); | |
# } | |
# | |
# return $text; | |
# } | |
# | |
# =method make_dict_length | |
# | |
# my $zchars = $z->make_dict_length( $zchars_string ) | |
# | |
# This method returns the Z-character string fit to dictionary length for the | |
# Z-machine version being handled. It will trim excess characters or pad with | |
# Z-character 5 to be the right length. | |
# | |
# When converting such strings back to ZSCII, you should pass the | |
# C<allow_early_termination> to C<zchars_to_zscii>, as a four-Z-character | |
# sequence may have been terminated early. | |
# | |
# =cut | |
# | |
# sub make_dict_length { | |
# my ($self, $zchars) = @_; | |
# | |
# my $length = $self->{version} >= 5 ? 9 : 6; | |
# $zchars = substr $zchars, 0, $length; | |
# $zchars .= "\x05" x ($length - length($zchars)); | |
# | |
# return $zchars; | |
# } | |
# | |
# =method pack_zchars | |
# | |
# my $packed_zchars = $z->pack_zchars( $zchars_string ); | |
# | |
# This method takes a string of unpacked Z-characters and packs them into a | |
# bytestring with three Z-characters per word. The final word will have its top | |
# bit set. | |
# | |
# =cut | |
# | |
# sub pack_zchars { | |
# my ($self, $zchars) = @_; | |
# | |
# my $bytestring = ''; | |
# | |
# while (my $substr = substr $zchars, 0, 3, '') { | |
# $substr .= chr(5) until length $substr == 3; | |
# | |
# my $value = ord(substr($substr, 0, 1)) << 10 | |
# | ord(substr($substr, 1, 1)) << 5 | |
# | ord(substr($substr, 2, 1)); | |
# | |
# $value |= (0x8000) if ! length $zchars; | |
# | |
# $bytestring .= pack 'n', $value; | |
# } | |
# | |
# return $bytestring; | |
# } | |
# | |
# =method unpack_zchars | |
# | |
# my $zchars_string = $z->pack_zchars( $packed_zchars ); | |
# | |
# Given a bytestring of packed Z-characters, this method will unpack them into a | |
# string of unpacked Z-characters that aren't packed anymore because they're | |
# unpacked instead of packed. | |
# | |
# Exceptions are raised if the input bytestring isn't made of an even number of | |
# octets, or if the string continues past the first word with its top bit set. | |
# | |
# =cut | |
# | |
# sub unpack_zchars { | |
# my ($self, $bytestring) = @_; | |
# | |
# Carp::croak("bytestring of packed zchars is not an even number of bytes") | |
# if length($bytestring) % 2; | |
# | |
# my $terminate; | |
# my $zchars = ''; | |
# while (my $word = substr $bytestring, 0, 2, '') { | |
# # XXX: Probably allow this to warn and `last` -- rjbs, 2013-01-18 | |
# Carp::croak("input continues after terminating byte") if $terminate; | |
# | |
# my $n = unpack 'n', $word; | |
# $terminate = $n & 0x8000; | |
# | |
# my $c1 = chr( ($n & 0b0111110000000000) >> 10 ); | |
# my $c2 = chr( ($n & 0b0000001111100000) >> 5 ); | |
# my $c3 = chr( ($n & 0b0000000000011111) ); | |
# | |
# $zchars .= "$c1$c2$c3"; | |
# } | |
# | |
# return $zchars; | |
# } | |
# | |
# 1; | |
my %for = ( | |
' ' => [ 0x20 ], | |
'.' => [ 0x05, 0x12 ], | |
',' => [ 0x05, 0x13 ], | |
'!' => [ 0x05, 0x14 ], | |
"\n" => [ 0x05, 0x07 ], | |
(map { chr(ord('a') + $_) => [ 6 + $_ ] } <== (0 .. 25)), | |
(map { chr(ord('A') + $_) => [ 0x04, 6 + $_ ] } <== (0 .. 25)), | |
); | |
method to-zscii (Str $string) { | |
my $result = Buf.new(); | |
my @zchars = map { die "unknown char $_" unless %for{$_}; %for{ $_ }.list }, | |
$string.comb; | |
my @values = map -> $c0 = 5, $c1 = 5, $c2 = 5 { | |
$c0 +< 10 | |
+| $c1 +< 5 | |
+| $c2; | |
}, @zchars; | |
@values[*-1] +|= 0x8000; | |
$result = [~] map { mkword($_) }, @values; | |
return $result; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment