chansen/unicode.pm

## unicode.pm
package unicode;
use strict;
use warnings;
use warnings::register;

use Carp   qw[croak cluck];
use Encode qw[];

# Bit flags of the compile-time hints word ($^H / PL_hints).
# from perl.h verified in 5.6.2, 5.8.9, 5.10.1 and blead (2010-02-27)
# HINT_BLOCK_SCOPE is tested by import(); HINT_LOCALIZE_HH is declared but
# not referenced anywhere in the code visible in this file.
sub HINT_BLOCK_SCOPE () { 0x00000100 }
sub HINT_LOCALIZE_HH () { 0x00020000 }

# PL_hints (U32)
# Perl 5.6.2 and 5.8.9:
#   HINT_UTF8       0x00800000
# Perl 5.10.1 and blead (2010-02-27):
#   HINT_NO_AMAGIC  0x01000000

# hijack the remaining (hopefully unused) 7 bits for our purposes.
# I rather not use the hint hash for performance reasons since we
# need fast traversal at runtime.
#
# The 7 bits (25..31) hold an index into @Encoding: import() stores it as
# `$index << 25` and the runtime helpers read it back with `>> 25`.
# An index of 0 therefore leaves all mask bits clear.
# NOTE(review): perl releases newer than the ones listed above may assign
# some of these bits to core hints -- confirm against the target perl.h.
sub HINT_UNICODE_MASK () { 0xFE000000 }

# Ask Carp not to attribute croak()/cluck() messages to frames in this package.
our @CARP_NOT = ( __PACKAGE__ );

# Module-wide state shared by import() and the runtime helper closures.
my $DefaultEncoding = 0;    # default @Encoding index (0 is UTF-8)
my @Encoding;               # Encode::Encoding.pm instances
my %EncodingMap;            # name/alias to @Encoding index
# Offset of the 'unicode' warnings category as listed in %warnings::Offsets.
# It is used with vec() on a caller's warnings bitmask; the bit at offset + 1
# is read as the "fatal" flag for the same category.
my $WarningsOffset  = $warnings::Offsets{unicode};

# Seed the tables at compile time: slot 0 of @Encoding is the UTF-8 encoding
# object and the three common spellings of its name all map to that slot.
BEGIN {
    push @Encoding, Encode::find_encoding('UTF-8');
    @EncodingMap{qw(UTF-8 utf-8 utf8)} = (0) x 3;
}

# Compile-time hook behind `use unicode;` and `use unicode ENCODING;`.
#
# With an ENCODING argument the name is looked up in %EncodingMap; unknown
# names are resolved through Encode, and an encoding seen for the first time
# is appended to @Encoding. The resulting index is then written into the
# HINT_UNICODE_MASK bits of $^H for the scope being compiled.
#
# Without an argument, an index already present in $^H is validated; an index
# with no entry in @Encoding is reported with cluck() and cleared.
#
# Croaks when the encoding is unknown to Encode or when @Encoding already
# holds the maximum number of entries (index 0x7F).
sub import {
    my (undef, @args) = @_;

    if ($^H & HINT_BLOCK_SCOPE) {
        warnings->import(FATAL => 'unicode');
    }

    if (@args) {
        my $name = $args[0];

        if (!exists $EncodingMap{$name}) {
            my $canonical = Encode::resolve_alias($name);
            croak(qq/unicode: unknown encoding '$name'/)
              unless defined $canonical;

            if (!exists $EncodingMap{$canonical}) {
                # Only 7 hint bits are available, so 0x7F is the last index.
                croak(q/unicode: reached maximum (0x7F) concurrent compile-time encodings/)
                  unless $#Encoding < 0x7F;
                push @Encoding, Encode::find_encoding($canonical);
                $EncodingMap{$canonical} = $#Encoding;
            }

            # Remember the alias so later lookups skip Encode entirely.
            $EncodingMap{$name} = $EncodingMap{$canonical};
        }

        my $index = $EncodingMap{$name};
        $^H = ($^H & ~HINT_UNICODE_MASK) | ($index << 25);
    }
    elsif (my $bits = $^H & HINT_UNICODE_MASK) {
        my $index = $bits >> 25;
        if (!exists $Encoding[$index]) {
            cluck(sprintf q/unicode: hints (0x%.2X) set to an unknown encoding (0x%.2X)/, $^H, $index);
            $^H &= ~HINT_UNICODE_MASK;
        }
    }
}

# Compile-time hook behind `no unicode;`: clears the encoding index stored
# in the HINT_UNICODE_MASK bits of $^H for the scope being compiled.
sub unimport {
    my $without_encoding = $^H & ~HINT_UNICODE_MASK;
    $^H = $without_encoding;
}

# Private helper: map an encoding name to its Encode encoding object.
#
# Names already registered in %EncodingMap are served from @Encoding; any
# other name is looked up through Encode::find_encoding.
# Croaks when no encoding object can be found for $name.
my $get_encoding = sub {
    my ($name) = @_;

    my $encoding = exists $EncodingMap{$name}
      ? $Encoding[ $EncodingMap{$name} ]
      : Encode::find_encoding($name);

    croak(qq/unicode: unknown encoding '$name'/)
      unless defined $encoding;

    return $encoding;
};

# Private helper: find the encoding object in effect for the current call.
#
# Walks the call stack starting at frame 1 and takes the encoding index from
# the first frame whose compile-time hints carry any HINT_UNICODE_MASK bits.
# When no frame carries such bits the index stays at $DefaultEncoding.
# Croaks (panic) if the selected index has no entry in @Encoding.
my $get_scope_encoding = sub {
    my $encoding = $DefaultEncoding;

    my $depth = 1;
    while (1) {
        # Slicing an empty caller() result yields an empty list, which is
        # how the end of the call stack is detected.
        my @frame_hints = (caller($depth++))[8];
        last unless @frame_hints;

        my $bits = $frame_hints[0] & HINT_UNICODE_MASK;
        next unless $bits;

        $encoding = $bits >> 25;
        last;
    }

    croak(qq/unicode: panic: no encoding at index '$encoding'/)
      unless exists $Encoding[$encoding];

    return $Encoding[$encoding];
};

# Private helper: build the Encode CHECK value for the current call.
#
# Always includes Encode::LEAVE_SRC. If the 'unicode' warnings category is
# enabled in the warnings bitmask of frame 1, Encode::WARN_ON_ERR is added,
# or Encode::DIE_ON_ERR when the category's fatal bit is set as well.
#
# Returns the combined CHECK value as an integer.
my $get_scope_encoding_check = sub {
    my $wbits = (caller(1))[9];
    my $check = Encode::LEAVE_SRC;

    # caller() can report an undefined bitmask (for example when the calling
    # code was compiled with no lexical warnings in effect). Treat that as
    # "category disabled" rather than handing undef to vec(), which would
    # emit an "uninitialized value" warning from inside this module.
    return $check unless defined $wbits;

    if (vec($wbits, $WarningsOffset, 1)) { # unicode.pm warnings ?
        my $is_fatal = vec($wbits, $WarningsOffset + 1, 1);
        $check |= ($is_fatal ? Encode::DIE_ON_ERR : Encode::WARN_ON_ERR);
    }
    return $check;
};

# unicode::decoded(string)
#
# Returns true when the argument is defined and either carries perl's
# internal UTF-8 flag or contains nothing outside the range \x00-\x7F.
# Returns false otherwise. Croaks unless called with exactly one argument.
sub decoded {
    croak(q/Usage: unicode::decoded(string)/) if @_ != 1;

    my $is_defined = defined $_[0];
    return $is_defined unless $is_defined;

    my $is_flagged = utf8::is_utf8($_[0]);
    return $is_flagged if $is_flagged;

    return $_[0] !~ /[^\x00-\x7F]/;
}

# unicode::string(string)  (also reachable as unicode::valid)
#
# Returns true when the argument passes decoded() and contains no code point
# above U+10FFFF, no surrogate and no noncharacter; false otherwise.
# Croaks unless called with exactly one argument.
sub string {
    croak(q/Usage: unicode::string(string)/) unless @_ == 1;

    if (!decoded($_[0])) {
        return !!0;
    }

    # prevent runtime warning (exception on $] < 5.011):
    # $] >= 5.011: Unicode non-character 0x%x is illegal for interchange
    # $] <  5.011: Malformed UTF-8 character (fatal)
    no warnings 'utf8';
    my $has_illegal = $_[0] =~ / [^\x{00}-\x{10FFFF}]          # perls UTF-X encoding supports 2**64
                               | \p{Surrogate}
                               | \p{Noncharacter_Code_Point}
                               /x;
    return !$has_illegal;
}

# Public alias: unicode::valid is the same subroutine as unicode::string.
*valid = \&string;

# unicode::encoding()
#
# Returns the mime_name of the encoding object selected for the current call
# by the scope lookup helper. Croaks if any argument is passed.
sub encoding {
    croak(q/Usage: unicode::encoding()/) if @_;

    # Call the helper directly from this sub: it inspects caller frames
    # relative to its own position on the stack.
    my $in_scope = &$get_scope_encoding;
    return $in_scope->mime_name;
}

# unicode::decode(octets [, encoding])
#
# Decodes the octets with the named encoding, or with the encoding selected
# by the scope lookup helper when no name is given. The CHECK value passed
# to the encoding object comes from the caller's warnings settings.
# Returns whatever the encoding object's decode method returns.
# Croaks unless called with one or two arguments.
sub decode {
    croak(q/Usage: unicode::decode(octets [, encoding])/)
      unless @_ == 1 || @_ == 2;

    # Both helpers inspect caller frames relative to their own stack
    # position, so they are invoked directly from this sub.
    my $encoding;
    if (@_ == 2) {
        $encoding = $get_encoding->($_[1]);
    }
    else {
        $encoding = &$get_scope_encoding;
    }

    my $check = &$get_scope_encoding_check;
    return $encoding->decode($_[0], $check);
}

# unicode::encode(string [, encoding])
#
# Encodes the string with the named encoding, or with the encoding selected
# by the scope lookup helper when no name is given. The CHECK value passed
# to the encoding object comes from the caller's warnings settings.
# Returns whatever the encoding object's encode method returns.
# Croaks unless called with one or two arguments.
sub encode {
    croak(q/Usage: unicode::encode(string [, encoding])/)
      unless @_ == 1 || @_ == 2;

    # Both helpers inspect caller frames relative to their own stack
    # position, so they are invoked directly from this sub.
    my $encoding;
    if (@_ == 2) {
        $encoding = $get_encoding->($_[1]);
    }
    else {
        $encoding = &$get_scope_encoding;
    }

    my $check = &$get_scope_encoding_check;
    return $encoding->encode($_[0], $check);
}

1;

## unicode.pod

      
    Raw
  

              unicode.pod
            
          
    NAME


unicode - Perl pragma ..

SYNOPSIS


use unicode 'UTF-8';
use warnings FATAL => 'unicode'; # recommended

my $octets = "\xE2\x98\xBA"; # U+263A WHITE SMILING FACE
my $string = unicode::decode($octets);

say unicode::encoding; # UTF-8

# Lexically scoped encoding
{
    use unicode 'UTF-16';
    say unicode::encoding; # UTF-16
    $octets = unicode::encode($string);
}

# Dynamically scoped encoding
{
    package Foo;
    use unicode;
    sub process {
        my ($string) = @_;
        say "encoding string to " . unicode::encoding;
        return unicode::encode($string);
    }

    package Bar;
    use unicode 'UTF-32';
    sub process {
        my ($string) = @_;
        return Foo->process($string);
    }
}

$octets = Foo->process($string); # UTF-8
$octets = Bar->process($string); # UTF-32

# Explicit encoding
$octets = unicode::encode($string, 'UTF-16');
$string = unicode::decode($octets, 'UTF-16');

# Well-formed?
say "String is a well-formed Unicode string"
  if unicode::valid($string);

say unicode::encode($string);

DESCRIPTION


FUNCTIONS


unicode::decode($octets [, $encoding = SCOPE | 'UTF-8'])


Returns a decoded representation of $octets in $encoding as a character string.

unicode::encode($string [, $encoding = SCOPE | 'UTF-8'])


Returns an encoded representation of $string in $encoding as an octet string.

unicode::encoding()


Returns the canonical encoding name.

unicode::normalize($string [, $form = 'NFC'])


Returns a normalized representation of $string in Unicode normalization $form as a character string.

Valid normalization forms are NFC, NFD, NFKD and NFKC.

unicode::valid($string)


Determine whether or not the supplied $string is a well-formed Unicode string.

A well-formed Unicode string consists of the values U+0000..U+D7FF and U+E000..U+10FFFF, excluding the noncharacter values U+nFFFE and U+nFFFF (where n is from 0 to 0x10) and the values U+FDD0..U+FDEF.

ENCODINGS


UTF-8


UTF-16


UTF-16LE


UTF-16BE


UTF-32


UTF-32LE


UTF-32BE


UCS-2


UCS-2BE


UCS-2LE


DIAGNOSTICS


Can't decode %s of type %s


(W unicode)


Can't decode a wide character string


(W unicode)


Can't decode ill-formed %s octet sequence <%s>


(W unicode)


Can't decode incomplete %s code unit <%s>


(W unicode)


Can't encode %s of type %s


(W unicode)


Can't interpret noncharacter code point U+%.4X as an abstract character


(W unicode)


Can't map code point U+%.4X to %s encoding


(W unicode) Code point U+%.4X can't be represented in %s encoding codespace.


Can't map surrogate code point U+%.4X to %s encoding


(W unicode) Surrogate code points are designated only for surrogate code units in the UTF-16 character encoding form. Surrogates consist of code points in the range U+D800 to U+DFFF.


Can't map noncharacter code point U+%.4X to %s encoding


(W unicode) Noncharacters are permanently reserved for internal use and should never be interchanged. Noncharacters consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10) and the values U+FDD0..U+FDEF.


Can't map restricted code point U-%.8X to %s encoding


(W unicode) Code points in the range U-00110000 to U-7FFFFFFF.

See JTC1/SC2/WG2 N 2175, "Proposal to restrict the range of code positions to the values up to U-0010FFFF", and JTC1/SC2/WG2 N 2225, "RESOLUTION M38.6 (Restriction of encoding space)".


Can't map extended code point %.8X to %s encoding


(W unicode) Code points in the range 2^31 to 2^64-1.


Unknown encoding '%s'


(F)


Unknown Unicode normalization form '%s'


(F)


Usage: unicode::%s


(F) Subroutine %s was called with invalid number of arguments.


Use of uninitialized value %s


(W uninitialized) Please see perldiag.
	package unicode;
	use strict;
	use warnings;
	use warnings::register;

	use Carp qw[croak cluck];
	use Encode qw[];

	# from perl.h verified in 5.6.2, 5.8.9, 5.10.1 and blead (2010-02-27)
	sub HINT_BLOCK_SCOPE () { 0x00000100 }
	sub HINT_LOCALIZE_HH () { 0x00020000 }

	# PL_hints (U32)
	# Perl 5.6.2 and 5.8.9:
	# HINT_UTF8 0x00800000
	# Perl 5.10.1 and blead (2010-02-27):
	# HINT_NO_AMAGIC 0x01000000

	# hijack the remaining (hopefully unused) 7 bits for our purposes.
	# I rather not use the hint hash for performance reasons since we
	# need fast traversal at runtime.
	sub HINT_UNICODE_MASK () { 0xFE000000 }

	our @CARP_NOT = ( __PACKAGE__ );

	my $DefaultEncoding = 0; # default @Encoding index (0 is UTF-8)
	my @Encoding; # Encode::Encoding.pm instances
	my %EncodingMap; # name/alias to @Encoding index
	my $WarningsOffset = $warnings::Offsets{unicode};

	BEGIN {
	push @Encoding, Encode::find_encoding('UTF-8');
	@EncodingMap{qw(UTF-8 utf-8 utf8)} = (0) x 3;
	}

	sub import {
	shift;

	warnings->import(FATAL => 'unicode')
	if $^H & HINT_BLOCK_SCOPE;

	if (@_) {
	my ($name) = @_;
	unless (exists $EncodingMap{$name}) {
	my $resolved = Encode::resolve_alias($name);
	(defined $resolved)
	\|\| croak(qq/unicode: unknown encoding '$name'/);
	unless (exists $EncodingMap{$resolved}) {
	($#Encoding < 0x7F)
	\|\| croak(q/unicode: reached maximum (0x7F) concurrent compile-time encodings/);
	push @Encoding, Encode::find_encoding($resolved);
	$EncodingMap{$resolved} = $#Encoding;
	}
	$EncodingMap{$name} = $EncodingMap{$resolved};
	}
	my $encoding = $EncodingMap{$name};
	$^H &= ~HINT_UNICODE_MASK;
	$^H \|= $encoding << 25;
	}
	elsif ($^H & HINT_UNICODE_MASK) {
	my $encoding = ($^H & HINT_UNICODE_MASK) >> 25;
	unless (exists $Encoding[$encoding]) {
	cluck(sprintf q/unicode: hints (0x%.2X) set to an unknown encoding (0x%.2X)/, $^H, $encoding);
	$^H &= ~HINT_UNICODE_MASK;
	}
	}
	}

	sub unimport {
	$^H &= ~HINT_UNICODE_MASK;
	}

	my $get_encoding = sub {
	my ($name) = @_;
	my $encoding;
	if (exists $EncodingMap{$name}) {
	$encoding = $Encoding[$EncodingMap{$name}];
	}
	else {
	$encoding = Encode::find_encoding($name);
	}
	(defined $encoding)
	\|\| croak(qq/unicode: unknown encoding '$name'/);
	return $encoding;
	};

	my $get_scope_encoding = sub {
	my $encoding = $DefaultEncoding;
	my $frame = 1;
	while (my ($hints) = (caller($frame++))[8]) {
	if ($hints & HINT_UNICODE_MASK) {
	$encoding = ($hints & HINT_UNICODE_MASK) >> 25;
	last;
	}
	}
	(exists $Encoding[$encoding])
	\|\| croak(qq/unicode: panic: no encoding at index '$encoding'/);
	return $Encoding[$encoding];
	};

	my $get_scope_encoding_check = sub {
	my $wbits = (caller(1))[9];
	my $check = Encode::LEAVE_SRC;
	if (vec($wbits, $WarningsOffset, 1)) { # unicode.pm warnings ?
	my $is_fatal = vec($wbits, $WarningsOffset + 1, 1);
	$check \|= ($is_fatal ? Encode::DIE_ON_ERR : Encode::WARN_ON_ERR);
	}
	return $check;
	};

	sub decoded {
	@_ == 1 \|\| croak(q/Usage: unicode::decoded(string)/);
	return (defined $_[0] && (&utf8::is_utf8 \|\| $_[0] !~ /[^\x00-\x7F]/));
	}

	sub string {
	@_ == 1 \|\| croak(q/Usage: unicode::string(string)/);
	return !!0 unless &decoded;
	# prevent runtime warning (exception on $] < 5.011):
	# $] >= 5.011: Unicode non-character 0x%x is illegal for interchange
	# $] < 5.011: Malformed UTF-8 character (fatal)
	no warnings 'utf8';
	return $_[0] !~ / [^\x{00}-\x{10FFFF}] # perls UTF-X encoding supports 2**64
	\| \p{Surrogate}
	\| \p{Noncharacter_Code_Point}
	/x;
	}

	*valid = \&string;

	sub encoding {
	@_ == 0 \|\| croak(q/Usage: unicode::encoding()/);
	return &$get_scope_encoding->mime_name;
	}

	sub decode {
	@_ == 1 \|\| @_ == 2 \|\| croak(q/Usage: unicode::decode(octets [, encoding])/);
	my $encoding = @_ == 1 ? &$get_scope_encoding : $get_encoding->($_[1]);
	$encoding->decode($_[0], &$get_scope_encoding_check);
	}

	sub encode {
	@_ == 1 \|\| @_ == 2 \|\| croak(q/Usage: unicode::encode(string [, encoding])/);
	my $encoding = @_ == 1 ? &$get_scope_encoding : $get_encoding->($_[1]);
	$encoding->encode($_[0], &$get_scope_encoding_check);
	}

	1;