Skip to content

Instantly share code, notes, and snippets.

@abevoelker
Created May 8, 2012 03:27
Show Gist options
  • Save abevoelker/2632312 to your computer and use it in GitHub Desktop.
Save abevoelker/2632312 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/perl
#!/usr/bin/env perl
#!/bin/sh
######################################################################
# unichars - list characters for one or more properties
#
# Tom Christiansen <tchrist@perl.com>
# v1.0: Fri Oct 22 23:05:16 MDT 2010
# v1.2: Tue Oct 26 08:28:25 MDT 2010
# better 5.10 support and simpler evals
#
################################################################
#
# This is an sh wrapper to run the script under
# whichever perl occurs first in your path. See
# CHOICEs 1 and 2 below for alternate strategies.
# The -x will throw off your line numbers otherwise.
#
######################################################################
#
# The next line is legal in both shell and perl,
# but perl sees the if 0 so doesn't execute it.
#
eval 'exec perl -x -S $0 ${1+"$@"}'
if 0;
### CHOICE 1:
######################################################################
### MAKE FOLLOWING #! line THE TOP LINE, REPLACING /usr/local/bin ###
### with wherever you have a late enough version of Perl is ###
### installed. Will run under 5.10, but prefers 5.12 or better. ###
######################################################################
#!/usr/local/bin/perl
# ^^^^^^^^^^^^^^ <=== CHANGE ME ###
######################################################################
### CHOICE 2:
######################################################################
### ALTERNATELY, the following #! line does the same thing as ###
### the tricksy sh eval exec line: it finds whichever Perl is ###
### first in your path. However, it works only on BSD systems ###
### (including MacOS), but breaks under Solaris and Linux. ###
######################################################################
#!/usr/bin/env perl -CLA
######################################################################
use strict;
use warnings; # qw[ FATAL all ];
use charnames qw[ :full :short latin greek ];
use 5.10.1;
use File::Basename qw[ basename ];
use Getopt::Long qw[ GetOptions ];
use File::Spec;
use Carp;
use Pod::Usage qw[ pod2usage ];
use Encode qw[ decode ];
use Unicode::UCD qw(charinfo casefold);
## use if $^V >= v5.11.3, qw[ feature unicode_strings ];
# don't need to import this
# NOTE(review): this forward-declares is_utf8 in package "utf", while the
# code further down calls utf8::is_utf8 -- possibly a typo; confirm before
# changing it.
sub utf::is_utf8($);
################################################################
# Forward declarations for the subroutines defined further down.  They let
# the subs be called before their definitions (some, such as display, are
# called without parentheses) and announce the prototypes early.
sub ARGCOUNT;
sub CF();
sub IT();
sub NAME();
sub NOT_REACHED;
sub NUM();
sub am_running_perldb;
sub check_options();
sub compile_filter();
sub deQ($);
sub deQQ($);
sub debug($);
sub dequeue($$);
sub display;
sub fork_pager;
sub genfuncs;
sub is_runnable;
sub locate_program;
sub main();
sub panic;
sub run_filter();
sub start_pager;
sub stupid_evil_and_wrong;
sub titlecase;
sub underscore;
################################################################
# Program-wide state.
our $VERSION = "1.4 (2011-04-11)";
$| = 1; # command buffering quick-feeds piped stdout
$0 = basename($0); # shorten up warnings/errors
our %Opt;              # command-line options, filled in by check_options()
our $CF;               # result of the latest casefold() call made by CF()
our $CI;               # charinfo() record of the code point being displayed
our $Shown_Count = 0;  # number of code points displayed; decides the exit status
# Everything happens inside main(), which exits by itself; the bare exit
# below is only reached if main() ever returns.
main();
exit;
################################################################
# IT - the character currently under test, i.e. whatever $_ holds.
sub IT() {
    return $_;
}

# NAME - the name charnames::viacode reports for the first code point of
# $_, or the empty string when it reports nothing.
sub NAME() {
    my $name = charnames::viacode(ord $_);
    return $name || "";
}
# genfuncs - install, at run time, the helper functions that criteria may call.
#
# Each helper is a closure assigned to a glob of the current package:
#
#   NFD NFC NFKD NFKC FCD FCC      the Unicode::Normalize function of that name
#   checkNFD ... checkFCC          the Unicode::Normalize check of that name
#   Singleton ... NFKC_MAYBE       Unicode::Normalize::is<Name> applied to ord($_)
#   UCA1 .. UCA4 (UCA is UCA1)     Unicode::Collate sort key at that level
#
# The supporting modules are only loaded when a helper is first called.
# Takes no arguments; the return value is not meaningful.
sub genfuncs {
    for my $nf ( qw< NFD NFC NFKD NFKC FCD FCC > ) {
        no strict "refs";
        # The (_) prototype passes $_ when the helper is called without an
        # argument, so reading $_[0] serves both "NFD" and "NFD($string)".
        # (Reading $_ directly would silently ignore an explicit argument.)
        *$nf = sub(_) {
            require Unicode::Normalize;
            "Unicode::Normalize::$nf"->($_[0]);
        };
    }

    for my $check ( qw< checkNFD checkNFC checkNFKD checkNFKC checkFCD checkFCC > ) {
        no strict "refs";
        *$check = sub(_) {
            require Unicode::Normalize;
            my $stat = "Unicode::Normalize::$check"->($_[0]);
            # NOTE(review): a defined-but-false answer is turned into the
            # true string "0 but true", and an undefined answer into plain
            # false.  Kept exactly as it was; confirm this is the intended
            # truth table for the check functions.
            if (defined $stat) {
                return $stat || "0 but true";
            } else {
                # trick to quiet zero-conversion under -w
                return 0 == 1;
            }
        };
    }

    for my $nf ( qw< Singleton Exclusion NonStDecomp Comp_Ex
                     NFD_NO  NFC_NO  NFC_MAYBE
                     NFKD_NO NFKC_NO NFKC_MAYBE >
               )
    {
        no strict "refs";
        # These take no argument: they always test the code point of $_.
        *$nf = sub() {
            require Unicode::Normalize;
            "Unicode::Normalize::is$nf"->(ord);
        };
    }

    for my $nl ( 1 .. 4 ) {
        no strict "refs";
        *{ "UCA$nl" } = sub(_) {
            require Unicode::Collate;
            # Quoted class names: the packages are loaded by the require
            # statements at run time, so they do not exist yet when this
            # code is compiled.
            my $class = "Unicode::Collate";
            my @args  = (level => $nl, variable => "Non-Ignorable");
            if ($Opt{locale}) {
                require Unicode::Collate::Locale;
                $class = "Unicode::Collate::Locale";
                push @args, locale => $Opt{locale};
            }
            # One collator per level, built on first use and then reused.
            state $coll = $class->new(@args);
            return $coll->getSortKey($_[0]);
        };
    }

    no warnings "once";
    *UCA = \&UCA1;
}
# CF - casefold status of the code point in $_.
#
# Stores the casefold() record in the global $CF and returns its "status"
# field, or the empty string when there is no record or no status.
sub CF() {
    $CF = casefold(ord);
    return "" unless $CF;
    return $CF->{status} || "";
}
# NUM - numeric value of $_ as reported by Unicode::UCD::num.
#
# Returns the value itself when it is true, the string "0 but true" when
# it is defined but false, and a plain false value when num() returns undef.
# Needs Unicode::UCD 0.32 or later.
sub NUM() {
    require Unicode::UCD;
    Unicode::UCD->VERSION(0.32);
    my $value = Unicode::UCD::num($_);
    # trick to quiet zero-conversion under -w
    return (0 == 1) unless defined $value;
    return $value || "0 but true";
}
################################################################
# main - drive the whole program: set up the output streams, read the
# options, build the helper functions and the filter, then run the filter
# over the selected code points.
#
# Never returns: exits 0 when at least one code point was shown, else 1.
sub main() {
    foreach my $stream ("STDOUT", "STDERR") {
        binmode($stream, ":utf8")
            || die "can't binmode($stream) to :utf8 encoding: $!";
    }

    check_options();
    genfuncs();
    compile_filter();

    # a reader that goes away early (a pager, head(1), ...) is not an error
    $SIG{PIPE} = sub { exit 0 };

    run_filter();

    print STDERR "$0: $Shown_Count code points matched.\n"
        if $Opt{verbose};

    close(STDOUT) || warn "$0: close stdout failed: $!\n";

    exit($Shown_Count ? 0 : 1);
}
################################################################
# debug - print one message line on STDERR, but only when --debug is set.
sub debug($) {
    my ($msg) = @_;
    return unless $Opt{debug};
    return print STDERR "$msg\n";
}
# check_options - parse the command line into %Opt and normalise @ARGV.
#
# With no arguments at all the program behaves as if it had been given
# "--all --category --script".  After option parsing, an empty criteria
# list is replaced by the single always-true criterion "1".  Criteria that
# contain non-ASCII bytes are decoded from UTF-8.
#
# Exits through pod2usage() on --help, --man or an option error.
sub check_options() {
    Getopt::Long::Configure qw[ bundling auto_version ];

    if (@ARGV == 0) {
        @ARGV = qw{
            --all
            --category
            --script
        };
    }

    # Only one option may own the short name "d".  Getopt::Long lets the
    # later of two duplicate specifications win, so -d has always meant
    # --decimal; --debug therefore has no short form, which avoids the
    # "Duplicate specification" warning.
    GetOptions(\%Opt, qw[
        help|h|?
        man|m
        debug
        unnamed|u
        bmp
        smp
        astral|all|a
        casefold|f
        decimal|d
        category|general|c|g
        combining|C
        script|s
        block|b
        bidi|B
        numeric|n
        locale|l=s
        nopager
        verbose
    ]) || pod2usage(2);

    pod2usage(0) if $Opt{help};
    pod2usage(-exitstatus => 0, -verbose => 2) if $Opt{man};

    # no criteria means "match everything"
    @ARGV = (1) unless @ARGV;

    if (grep /\P{ASCII}/ => @ARGV) {
        @ARGV = map { decode("UTF-8", $_) } @ARGV;
    }
}
# compile_filter - turn the criteria left in @ARGV into a sub named "filter".
#
# Each criterion is first test-compiled on its own, wrapped in an anonymous
# sub with fatal warnings, so that a bad criterion is reported by name.  The
# criteria are then each wrapped in a do-block, joined with "&&", and
# compiled by a string eval into "sub filter", which run_filter() calls
# with the candidate character in $_.
#
# The criteria are arbitrary Perl code taken from the command line and are
# executed as such; that is the purpose of the program.
#
# Dies if any criterion, or the combined code, fails to compile.
sub compile_filter() {
my @criteria;
for my $i ( 0 .. $#ARGV ) {
my $snippet = $ARGV[$i];
$snippet =~ s/^\s+//;
# args starting with a backslash or which are a bracketed
# expression are interpreted as pattern matches
if ($snippet =~ m{ ^ \\ | ^ \[ .* \] $ }x) {
$snippet = "/$snippet/";
}
# The first heredoc is not interpolated; the second one is, so that the
# text of the criterion lands inside the anonymous sub.
my $test_compile = deQ <<'START_TEST';
|Q| use warnings qw[FATAL all];
|Q| my $ignore =
START_TEST
$test_compile .= deQQ(<<"END_TEST");
|QQ| sub { $snippet };
|QQ|
|QQ| # so eval returns true
|QQ| 1;
|QQ|
END_TEST
# debug("test compile:\n$test_compile");
eval($test_compile) ||
die "$0: invalid criterion in '$snippet': $@\n";
$criteria[$i] = "do { $snippet }";
}
# Assemble the real filter: fixed prologue, the criteria joined with &&,
# fixed epilogue.
my $real_code = deQ(<<'START_CODE') . "\t";
|Q| use warnings;
|Q| #use warnings qw[FATAL all];
|Q| #no warnings qw[deprecated];
|Q|
|Q| sub filter {
|Q|
|Q| debug(sprintf("testing code point %X", ord()));
|Q|
|Q| my $result =
|Q|
START_CODE
$real_code .= join("\n &&\n\t" => @criteria)
. deQ(<<'END_CODE');
|Q|
|Q| ;
|Q|
|Q| debug("result of " . join(" && ",@criteria) . " is $result");
|Q| return $result;
|Q| }
|Q|
|Q| # so eval returns true
|Q| 1;
END_CODE
debug("CRITERIA are\n$real_code");
eval($real_code) || die;
}
# run_filter - walk the selected range of code points, and display every
# one that passes the compiled filter.
#
# The range is chosen from --bmp, --smp and --astral.  Surrogates and the
# noncharacter code points are always skipped; unless --unnamed is given,
# so are Unassigned, PrivateUse, Han and Hangul-syllable code points.
# For each code point shown, $Shown_Count is incremented and $CI is set to
# its charinfo() record; the optional columns are driven by %Opt.
sub run_filter() {
my $first_codepoint = 0x00_0000;
my $last_codepoint = 0x10_FFFF;
# Default (neither --astral nor --smp): plane 0 only.
unless ($Opt{astral} || $Opt{smp}) {
$last_codepoint = 0x00_FFFF;
}
if ($Opt{bmp}) {
$first_codepoint = 0x00_0000;
$last_codepoint = 0x00_FFFF;
}
# --smp ends the range at the end of plane 1, and starts it at plane 1
# unless --bmp was given as well.
if ($Opt{smp}) {
$first_codepoint = 0x01_0000 unless $Opt{bmp};
$last_codepoint = 0x01_FFFF;
}
# --astral always extends the range to the last code point.
if ($Opt{astral}) {
$last_codepoint = 0x10_FFFF;
}
# Column widths for the hexadecimal and decimal code point numbers.
my $hex_width = length(sprintf("%x", $last_codepoint));
my $dec_width = length(sprintf("%d", $last_codepoint));
--$hex_width if $last_codepoint == 0x10_FFFF;
debug(sprintf("checking codepoints %0${hex_width}X .. %0${hex_width}X",
$first_codepoint, $last_codepoint));
CODEPOINT:
for my $codepoint ( $first_codepoint .. $last_codepoint ) {
# gaggy UTF-16 surrogates are invalid UTF-8 code points
next if $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
# from utf8.c in perl src; must avoid fatals in 5.10
next if $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF;
# skips every code point whose low 16 bits are FFFE or FFFF
next if 0xFFFE == ($codepoint & 0xFFFE); # both FFFE and FFFF
# debug("testing codepoint $codepoint");
# see "Unicode non-character %s is illegal for interchange" in perldiag(1)
$_ = do { no warnings "utf8"; chr($codepoint) };
# fixes "the Unicode bug"
unless (utf8::is_utf8($_)) {
$_ = decode("iso-8859-1", $_);
}
unless ($Opt{unnamed}) {
# won't find string names for any of these, so don't bother printing
next if m{ \p{Unassigned} }x;
next if m{ \p{PrivateUse} }x;
next if m{ \p{Han} }x;
next if m{ \p{InHangulSyllables} }x;
}
# filter() is the sub generated by compile_filter(); it tests $_
next unless &filter;
$Shown_Count++;
$CI = charinfo(ord);
# First column: the character itself, or a placeholder for characters in
# the \pC and \pZ classes.
if (/[\pC\pZ]/) {
display " ---- ";
} else {
# display "\N{LEFT-TO-RIGHT OVERRIDE}" ;# if /[\p{BC=R}\p{BC=AL}\p{BC=AN}\p{BC=ON}]/;
# display " " if /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/;
display " ";
# give a nonspacing mark a dotted circle to sit on
display "\N{DOTTED CIRCLE}" if /\p{BC=NSM}/;
# display " \N{LEFT-TO-RIGHT MARK}$_\N{LEFT-TO-RIGHT MARK} ";
display "$_ ";
# display " " unless /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/;
# extra padding except for full-width and wide characters
display " " unless /[\p{EA=F}\p{EA=W}]/;
}
display sprintf "%${dec_width}d %0${hex_width}X ", ($codepoint) x 2
if $Opt{decimal};
display sprintf "U+%0${hex_width}X ", $codepoint;
# Optional columns, each taken from the charinfo() record in $CI.
if ($Opt{category}) {
display sprintf("GC=%2s ", $CI->{category});
}
if ($Opt{casefold}) {
display sprintf("CF=%1s ", CF());
}
if ($Opt{bidi}) {
display sprintf("BC=%-3s ", $CI->{bidi});
}
if ($Opt{numeric}) {
display sprintf("NV=%-4s ", $CI->{numeric});
}
if ($Opt{block}) {
display sprintf("BLK=%-22s ", underscore($CI->{block}));
}
if ($Opt{script}) {
display sprintf("SC=%-12s ", titlecase($CI->{script}));
}
if ($Opt{combining}) {
display sprintf("CC=%-3s ", $CI->{combining});
}
# Last column: the character name.
display sprintf "%s\n", charnames::viacode($codepoint) || "<unnamed codepoint>";
}
}
# underscore - return a copy of the argument with every space replaced
# by an underscore.
sub underscore {
    my ($text) = @_;
    $text =~ tr/ /_/;
    return $text;
}
# titlecase - return a copy of the argument in which every lowercase
# letter that directly follows a hyphen or an underscore is title-cased.
sub titlecase {
    my ($text) = @_;
    $text =~ s/(?<=[-_])(\p{Ll})/\u$1/g;
    return $text;
}
# display - print one string on STDOUT.  The very first call tries to
# start the pager beforehand.  Takes exactly one argument.
sub display {
    state $pager_attempted = 0;
    ARGCOUNT() if @_ != 1;
    my ($text) = @_;
    if (!$pager_attempted++) {
        start_pager();
    }
    print $text;
}
# am_running_perldb - true when %DB::sub has at least one key.
# NOTE(review): presumably that hash is only populated when the program
# runs under the Perl debugger -- confirm.
sub am_running_perldb {
    no warnings "once";
    my $debugger_subs = keys %DB::sub;
    return $debugger_subs > 0;
}
# locate_program - find a runnable program by name.
#
# An absolute name is checked as it stands; any other name is looked up
# in each directory of the search path, in order.  Returns the runnable
# path, or nothing when the name is undefined, empty or not found.
sub locate_program {
    ARGCOUNT() unless @_ == 1;
    my ($name) = @_;

    return if !defined($name) || !length($name);

    return is_runnable($name)
        if File::Spec->file_name_is_absolute($name);

    foreach my $dir (File::Spec->path()) {
        my $found = is_runnable(File::Spec->catfile($dir, $name));
        return $found if $found;
    }
    return;
}
# is_runnable - return the path when it names an executable that is not a
# directory; otherwise return nothing.  On the systems recognised by
# stupid_evil_and_wrong(), a path without an ".exe" suffix is tried once
# more with the suffix added.
sub is_runnable {
    ARGCOUNT() unless @_ == 1;
    my ($candidate) = @_;

    return $candidate if -x $candidate && ! -d _;

    if (stupid_evil_and_wrong() && $candidate !~ /\.exe\z/i) {
        return is_runnable("$candidate.exe");
    }
    return ();
}
# stupid_evil_and_wrong - true when the operating system name in $^O,
# compared case-insensitively, is one of: dos, os2, netware, symbian,
# mswin32.  is_runnable() uses this to decide whether to retry a path
# with ".exe" appended.
#
# Uses plain string comparison instead of the smartmatch operator, which
# triggers an experimental/deprecated warning on later perls.
sub stupid_evil_and_wrong {
    my $os = lc $^O;
    my $matches = grep { $os eq $_ } qw<dos os2 netware symbian mswin32>;
    return $matches ? 1 : "";
}
# panic - abort with a stack trace; the arguments, joined by spaces,
# follow the "INTERNAL ERROR" prefix.
sub panic {
    my $details = "@_";
    Carp::confess("$0: INTERNAL ERROR: $details");
}

# NOT_REACHED - panic from a place that control should never get to.
sub NOT_REACHED {
    return panic("NOT REACHED");
}

# ARGCOUNT - panic because a function got the wrong number of arguments.
sub ARGCOUNT {
    return panic("wrong arguments to function");
}
# dequeue - strip a margin marker from the lines of a text.
#
# Removes, at the start of every line, any leading whitespace followed by
# the literal $leader and at most one space.  Returns the stripped copy.
sub dequeue($$) {
    my ($leader, $body) = @_;
    my $margin = qr/^\s*\Q$leader\E ?/m;
    $body =~ s/$margin//g;
    return $body;
}

# deQ - dequeue() for text whose lines are marked with "|Q|".
sub deQ($) {
    my ($text) = @_;
    return dequeue('|Q|', $text);
}

# deQQ - dequeue() for text whose lines are marked with "|QQ|".
sub deQQ($) {
    my ($text) = @_;
    return dequeue('|QQ|', $text);
}
# start_pager - send the rest of STDOUT through a pager when appropriate.
#
# Does nothing when running under the debugger, when --nopager was given,
# when STDOUT is not a terminal, or when no pager can be located.  The
# candidates, in order, are $ENV{PAGER}, "less", "more" and "type".
# A pager whose path looks like less(1) is run with -r and with
# LESSCHARSET set to utf-8.
#
# Takes no arguments.  Dies if the pager cannot be started: a failed
# reopen leaves STDOUT closed, so carrying on would silently lose all
# output.
sub start_pager {
    ARGCOUNT() unless @_ == 0;
    return if am_running_perldb();
    return if $Opt{nopager};
    return unless -t STDOUT;

    my $his_pager =  locate_program($ENV{PAGER})
                  || locate_program("less")
                  || locate_program("more")
                  || locate_program("type")
                  ;
    return unless $his_pager;

    my $am_less = ($his_pager =~ /\bless\b/i);
    # localised for the rest of this sub, which is long enough for the
    # pager process started below to inherit it
    local $ENV{LESSCHARSET} = "utf-8" if $am_less;

    my @pager_args = ();
    push (@pager_args, "-r") if $am_less;

    open(STDOUT, "|- :utf8", $his_pager, @pager_args)
        || die "$0: can't start pager $his_pager: $!\n";
}
# fork_pager - empty stub: it tests whether STDOUT is a terminal and then
# does nothing.  Not called anywhere in the visible source; start_pager()
# is what actually launches the pager.
sub fork_pager {
if (-t STDOUT) {
}
}
################################################################
################################################################
################################################################
__END__
=encoding utf8
=head1 NAME
unichars - list characters for one or more properties
=head1 SYNOPSIS
B<unichars> [I<options>] I<criterion> ...
Each criterion is either a square-bracketed character class, a regex
starting with a backslash, or an arbitrary Perl expression. See the
EXAMPLES section below.
OPTIONS:
Selection Options:
--bmp include the Basic Multilingual Plane (plane 0) [DEFAULT]
--smp include the Supplementary Multilingual Plane (plane 1)
--astral -a include planes above the BMP (planes 1-16)
--unnamed -u include various unnamed characters (see DESCRIPTION)
--locale -l specify the locale used for UCA functions
Display Options:
--category -c include the general category (GC=)
--script -s include the script name (SC=)
--block -b include the block name (BLK=)
--bidi -B include the bidi class (BC=)
--combining -C include the canonical combining class (CC=)
--numeric -n include the numeric value (NV=)
--casefold -f include the casefold status
--decimal -d include the decimal representation of the code point
Miscellaneous Options:
--version -v print version information and exit
--help -h this message
--man -m full manpage
--debug show debugging of criteria and examined code point span
Special Functions:
$_ is the current code point
ord is the current code point's ordinal
NAME is charnames::viacode(ord)
NUM is Unicode::UCD::num($_), not code point number
CF is casefold->{status}
NFD, NFC, NFKD, NFKC, FCD, FCC (normalization)
UCA, UCA1, UCA2, UCA3, UCA4 (binary sort keys)
Singleton, Exclusion, NonStDecomp, Comp_Ex
checkNFD, checkNFC, checkNFKD, checkNFKC, checkFCD, checkFCC
NFD_NO, NFC_NO, NFC_MAYBE, NFKD_NO, NFKC_NO, NFKC_MAYBE
=head1 DESCRIPTION
The I<unichars> program reports which characters match all selection criteria
I<and>ed together.
A criterion beginning with a square bracket or a backslash is assumed to be
a regular expression. Anything else is a Perl expression such as you might
pass to the Perl C<grep> function. The C<$_> variable is set to each
successive Unicode character, and if all criteria match, that character is
displayed.
The numeric code point is therefore accessible as C<ord>.
The special token C<NAME> is set to the full name of the current code point.
Also, the tokens C<NFD>, C<NFKD>, C<NFC>, and C<NFKC> are set to the
corresponding normalization form.
By default only plane 0, the Basic Multilingual Plane, is examined.
For plane 1, the Supplementary Multilingual Plane, use B<--smp>.
To examine both, specify both the B<--bmp> and B<--smp> options.
To include I<all> valid code points, use the B<-a> or B<--astral> option.
Unless the B<--unnamed> option is given, characters with any of the
properties Unassigned, PrivateUse, Han, or InHangulSyllables will be
excluded.
=head1 EXAMPLES
Count all non-ASCII digits:
$ unichars -a '\d' '\P{ASCII}' | wc -l
401
Find all line terminators:
$ unichars '\R'
-- 10 0000A LINE FEED (LF)
-- 11 0000B LINE TABULATION
-- 12 0000C FORM FEED (FF)
-- 13 0000D CARRIAGE RETURN (CR)
-- 133 00085 NEXT LINE (NEL)
-- 8232 02028 LINE SEPARATOR
-- 8233 02029 PARAGRAPH SEPARATOR
Find what is not C<\s> but is C<[\h\v]>:
$ unichars '\S' '[\h\v]'
-- 11 0000B LINE TABULATION
Count how many code points in the Basic Multilingual Plane
are I<not> marks but I<are> diacritics:
$ unichars '\PM' '\p{Diacritic}' | wc -l
209
Count how many code points in the Basic Multilingual Plane
I<are> marks but are I<not> diacritics:
$ unichars '\pM' '\P{Diacritic}' | wc -l
750
Find all code points that are Letters, are in the Greek script,
have differing canonical and compatibility decompositions, and
whose name contains "SYMBOL":
$ unichars -a '\pL' '\p{Greek}' 'NFD ne NFKD' 'NAME =~ /SYMBOL/'
ϐ 976 003D0 GREEK BETA SYMBOL
ϑ 977 003D1 GREEK THETA SYMBOL
ϒ 978 003D2 GREEK UPSILON WITH HOOK SYMBOL
ϓ 979 003D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
ϔ 980 003D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
ϕ 981 003D5 GREEK PHI SYMBOL
ϖ 982 003D6 GREEK PI SYMBOL
ϰ 1008 003F0 GREEK KAPPA SYMBOL
ϱ 1009 003F1 GREEK RHO SYMBOL
ϲ 1010 003F2 GREEK LUNATE SIGMA SYMBOL
ϴ 1012 003F4 GREEK CAPITAL THETA SYMBOL
ϵ 1013 003F5 GREEK LUNATE EPSILON SYMBOL
Ϲ 1017 003F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
Find all numeric nondigits in the Latin script (within the BMP):
$ unichars '\pN' '\D' '\p{Latin}'
Ⅰ 8544 02160 ROMAN NUMERAL ONE
Ⅱ 8545 02161 ROMAN NUMERAL TWO
Ⅲ 8546 02162 ROMAN NUMERAL THREE
Ⅳ 8547 02163 ROMAN NUMERAL FOUR
Ⅴ 8548 02164 ROMAN NUMERAL FIVE
Ⅵ 8549 02165 ROMAN NUMERAL SIX
Ⅶ 8550 02166 ROMAN NUMERAL SEVEN
Ⅷ 8551 02167 ROMAN NUMERAL EIGHT
(etc)
Find the first three alphanumunderish code points with no assigned name:
$ unichars -au '\w' '!length NAME' | head -3
㐀 13312 003400 <unnamed codepoint>
㐁 13313 003401 <unnamed codepoint>
㐂 13314 003402 <unnamed codepoint>
Count the combining characters in the Supplementary Multilingual Plane:
$ unichars --smp '\pM' | wc -l
61
=head1 ENVIRONMENT
If your environment smells like it's in a Unicode encoding,
program arguments will be in UTF-8.
=head1 BUGS
The B<--man> option does not correctly process the page for UTF-8, because
it does not pass the necessary B<--utf8> option to L<pod2man>.
=head1 SEE ALSO
L<uniprops>,
L<uninames>,
L<perluniprops>,
L<perlunicode>,
L<perlrecharclass>,
L<perlre>
=head1 AUTHOR
Tom Christiansen <I<tchrist@perl.com>>
=head1 COPYRIGHT AND LICENCE
Copyright 2010 Tom Christiansen.
This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.
#!/usr/bin/env perl
#
# unifuck - print infinite permutations of fuck in unicode aliases
#
# Tom Christiansen <tchrist@perl.com>
# Mon May 23 09:37:27 MDT 2011
use strict;
use warnings;
use charnames ":full";
use Unicode::Normalize;
# Main body of the script: prints an endless stream of four-"letter" lines,
# each letter drawn at random from a pool of look-alike characters and
# sometimes decorated with combining marks.
binmode(STDOUT, ":utf8");
our(@diddle, @fuck, %fuck); # initted down below
# The table is a flat list read four entries at a time: one candidate for
# each of the four positions.  Using hash keys collapses repeated entries.
while (my($f,$u,$c,$k) = splice(@fuck, 0, 4)) {
$fuck{F}{$f}++;
$fuck{U}{$u}++;
$fuck{C}{$c}++;
$fuck{K}{$k}++;
}
# One candidate pool per position.
my @F = keys %{ $fuck{F} };
my @U = keys %{ $fuck{U} };
my @C = keys %{ $fuck{C} };
my @K = keys %{ $fuck{K} };
# Runs until the process is killed or interrupted.
while (1) {
my $f = $F[rand @F];
my $u = $U[rand @U];
my $c = $C[rand @C];
my $k = $K[rand @K];
# Decorate each pick in place, except picks that are longer than one
# character, wide, already marks, or enclosed alphanumerics.
for ($f,$u,$c,$k) {
next if length > 1;
next if /\p{EA=W}/;
next if /\pM/;
next if /\p{InEnclosedAlphanumerics}/;
# 15% chance of a random combining mark, then 1% chance of a keycap
s/$/$diddle[rand @diddle]/ if rand(100) < 15;
s/$/\N{COMBINING ENCLOSING KEYCAP}/ if rand(100) < 1;
}
# At most one of the following mutations is applied; each test draws its
# own random number, in this order.
if ( 0) { }
elsif (rand(100) < 5) { $u = q(@) }
elsif (rand(100) < 5) { $c = q(*) }
elsif (rand(100) < 10) { ($c,$k) = ($k,$c) }
elsif (rand(100) < 15) { ($f,$u,$c,$k) = reverse ($f,$u,$c,$k) }
print NFC("$f $u $c $k\n");
}
# Data tables, filled in at compile time so that they are ready when the
# main body above starts running.
BEGIN {
# The first table is a flat list read in groups of four: one candidate for
# each of the four letter positions, in order.
# ok to have repeats in each position, since each distinct string is
# counted only once
@fuck = (
"\N{LATIN CAPITAL LETTER F}",
"\N{LATIN CAPITAL LETTER U}",
"\N{LATIN CAPITAL LETTER C}",
"\N{LATIN CAPITAL LETTER K}",
"\N{LATIN SMALL LETTER F}",
"\N{LATIN SMALL LETTER U}",
"\N{LATIN SMALL LETTER C}",
"\N{LATIN SMALL LETTER K}",
"\N{LATIN SMALL LETTER F}",
"\N{INFINITY}",
"\N{LATIN SMALL LETTER C}",
"\N{LATIN SMALL LETTER K}",
"\N{LATIN SMALL LETTER F}",
"\N{LATIN SMALL LETTER O}\N{LATIN SMALL LETTER O}",
"\N{LATIN SMALL LETTER C}",
"\N{KELVIN SIGN}",
"\N{LATIN SMALL LETTER F}",
"\N{DIGIT ZERO}\N{DIGIT ZERO}",
"\N{CENT SIGN}",
"\N{LATIN CAPITAL LETTER K}",
"\N{LATIN LETTER SMALL CAPITAL F}",
"\N{LATIN LETTER SMALL CAPITAL U}",
"\N{LATIN LETTER SMALL CAPITAL C}",
"\N{LATIN LETTER SMALL CAPITAL K}",
"\N{MODIFIER LETTER SMALL F}",
"\N{MODIFIER LETTER SMALL U}",
"\N{MODIFIER LETTER SMALL C}",
"\N{MODIFIER LETTER SMALL K}",
"\N{MATHEMATICAL SCRIPT SMALL F}",
"\N{MATHEMATICAL SCRIPT SMALL U}",
"\N{MATHEMATICAL SCRIPT SMALL C}",
"\N{MATHEMATICAL SCRIPT SMALL K}",
"\N{MATHEMATICAL BOLD FRAKTUR CAPITAL F}",
"\N{MATHEMATICAL BOLD FRAKTUR CAPITAL U}",
"\N{MATHEMATICAL BOLD FRAKTUR CAPITAL C}",
"\N{MATHEMATICAL BOLD FRAKTUR CAPITAL K}",
"\N{MATHEMATICAL BOLD FRAKTUR SMALL F}",
"\N{MATHEMATICAL BOLD FRAKTUR SMALL U}",
"\N{MATHEMATICAL BOLD FRAKTUR SMALL C}",
"\N{MATHEMATICAL BOLD FRAKTUR SMALL K}",
"\N{MATHEMATICAL BOLD SCRIPT CAPITAL F}",
"\N{MATHEMATICAL SCRIPT CAPITAL U}",
"\N{MATHEMATICAL SCRIPT CAPITAL C}",
"\N{MATHEMATICAL SCRIPT CAPITAL K}",
"\N{CIRCLED LATIN SMALL LETTER F}",
"\N{CIRCLED LATIN SMALL LETTER U}",
"\N{CIRCLED LATIN SMALL LETTER C}",
"\N{CIRCLED LATIN SMALL LETTER K}",
"\N{PARENTHESIZED LATIN SMALL LETTER F}",
"\N{PARENTHESIZED LATIN SMALL LETTER U}",
"\N{PARENTHESIZED LATIN SMALL LETTER C}",
"\N{PARENTHESIZED LATIN SMALL LETTER K}",
"\N{GREEK CAPITAL LETTER GAMMA}\N{COMBINING SHORT STROKE OVERLAY}",
"\N{GOTHIC LETTER QAIRTHRA}",
"\N{CHEROKEE LETTER TLI}",
"\N{CHEROKEE LETTER TSO}",
"\N{LATIN SMALL LETTER F WITH HOOK}",
"\N{GREEK SMALL LETTER MU}",
"\N{LATIN SMALL LETTER C WITH CURL}",
"\N{CYRILLIC CAPITAL LETTER IOTIFIED E}",
"\N{CYRILLIC CAPITAL LETTER GHE}\N{COMBINING SHORT STROKE OVERLAY}",
"\N{CYRILLIC CAPITAL LETTER TSE}",
"\N{CYRILLIC CAPITAL LETTER ES}",
"\N{CYRILLIC CAPITAL LETTER KA}",
"\N{CYRILLIC SMALL LETTER GHE WITH STROKE}",
"\N{LATIN SMALL CAPITAL LETTER U WITH STROKE}",
"\N{LATIN SMALL LETTER C WITH STROKE}",
"\N{LATIN SMALL LETTER K WITH HOOK}",
"\N{GREEK LETTER DIGAMMA}",
"\N{GREEK SMALL LETTER UPSILON}",
"\N{GREEK LETTER STIGMA}",
"\N{GREEK CAPITAL LETTER KAPPA}",
"\N{HANGUL JONGSEONG KHIEUKH}",
"\N{LATIN CAPITAL LETTER U}",
"\N{ROMAN NUMERAL REVERSED ONE HUNDRED}",
"\N{CYRILLIC CAPITAL LETTER ZHE}",
"\N{LATIN SMALL LETTER DOTLESS J WITH STROKE}",
"\N{LATIN SMALL LETTER N}",
"\N{LATIN SMALL LETTER OPEN O}",
"\N{LATIN SMALL LETTER TURNED K}",
"\N{FULLWIDTH LATIN CAPITAL LETTER F}",
"\N{FULLWIDTH LATIN CAPITAL LETTER U}",
"\N{FULLWIDTH LATIN CAPITAL LETTER C}",
"\N{FULLWIDTH LATIN CAPITAL LETTER K}",
);
# Combining marks, one of which is occasionally appended to a letter.
@diddle = (
"\N{COMBINING GRAVE ACCENT}",
"\N{COMBINING ACUTE ACCENT}",
"\N{COMBINING CIRCUMFLEX ACCENT}",
"\N{COMBINING TILDE}",
"\N{COMBINING BREVE}",
"\N{COMBINING DOT ABOVE}",
"\N{COMBINING DIAERESIS}",
"\N{COMBINING CARON}",
"\N{COMBINING CANDRABINDU}",
"\N{COMBINING INVERTED BREVE}",
"\N{COMBINING GRAVE TONE MARK}",
"\N{COMBINING ACUTE TONE MARK}",
"\N{COMBINING GREEK PERISPOMENI}",
"\N{COMBINING FERMATA}",
"\N{COMBINING SUSPENSION MARK}",
);
}
#!/usr/local/bin/perl
######################################################################
# uniprops - list regex properties of one or more characters
#
# Tom Christiansen <tchrist@perl.com>
######################################################################
#
# This is an sh wrapper to run the script under
# whichever perl occurs first in your path. See
# CHOICEs 1 and 2 below for alternate strategies.
#
######################################################################
#
# The next line is legal in both shell and perl,
# but perl sees the if 0 so doesn't execute it.
#
eval 'exec perl -x -S $0 ${1+"$@"}'
if 0;
### CHOICE 1:
######################################################################
### MAKE FOLLOWING #! line THE TOP LINE, REPLACING /usr/local/bin ###
### with wherever you have a late enough version of Perl is ###
### installed. Will run under 5.10, but prefers 5.12 or better. ###
######################################################################
#!/usr/local/bin/perl
# ^^^^^^^^^^^^^^ <=== CHANGE ME ###
######################################################################
### CHOICE 2:
######################################################################
### ALTERNATELY, the following #! line does the same thing as ###
### the tricksy sh eval exec line: it finds whichever Perl is ###
### first in your path. However, it works only on BSD systems ###
### (including MacOS), but breaks under Solaris and Linux. ###
######################################################################
#!/usr/bin/env perl
######################################################################
# Revision History:
# v1.0: Fri Oct 22 19:17:20 MDT 2010
# v1.1: Sun Oct 24 16:33:07 MDT 2010
# linux patches
# v1.2: Sun Oct 24 17:51:29 MDT 2010
# rework proplist reading for backwards compat on 5.10
# or anywhere can't find proplist
# v1.3: Tue Oct 26 08:00:30 MDT 2010
#
######################################################################
use 5.10.0; # but prefer 5.12.0
use strict;
use warnings; # qw[ FATAL all ]
# gives flexibility in specifying chars
use charnames qw[
:short
:full
latin
greek
];
################################################################
use Scalar::Util qw[ looks_like_number ];
use Encode qw[ decode ];
use File::Basename qw[ basename ];
use Getopt::Long qw[ GetOptions ];
use Carp qw[ confess ];
use Pod::Usage;
# don't need to import this
# NOTE(review): this forward-declares is_utf8 in package "utf", while
# uprops_chr() calls utf8::is_utf8 -- possibly a typo; confirm before
# changing it.
sub utf::is_utf8($);
################################################################
# Forward declarations, so that the prototypes are known before the
# subroutines are called.
sub am_unixy();
sub attractively;
sub cp2name($);
sub debug($);
sub dequeue($$);
sub init_screen();
sub load_properties();
sub main();
sub perlprops($);
sub quote($);
sub reorder(@);
sub sysprops();
sub uniprops($);
sub uprops_chr($);
sub uprops_cp($);
sub valid_prop($);
sub verbose(@);
sub writeln(@);
################################################################
# Program-wide state.
$| = 1; # feed the hungry pipe
$0 = basename($0); # shorten up warnings/errors
our @All_Properties = ();
our %Opt = ();                      # command-line options, filled in by main()
our $VERSION = "1.5 (2011-04-11)";
our $Errors = 0;                    # number of problems reported so far
main();
# exit status is 1 if anything was reported as an error, else 0
exit($Errors != 0);
# good idea in general, but critical if forkopened in sysprops()
END {
close(STDOUT)
|| die "can't close STDOUT: $!";
}
################################################################
# main - parse the command line and report the properties of every
# character named on it.
#
# Each remaining argument is interpreted, in this order, as:
#   1. a single literal character;
#   2. a hex code point written as 0xHHHH, U+HHHH, \xHHHH, \uHHHH,
#      \x{HHHH} or \u{HHHH};
#   3. a character name known to charnames::vianame, or else one that
#      works inside a \N{...} escape;
#   4. a bare string of hex digits.
# An argument that fits none of these is reported on STDERR and counted
# in $Errors.
#
# Exits through pod2usage() on --help, --man, an option error or a
# missing argument.
sub main() {
    for my $fh ( qw[STDOUT STDERR] ) {
        binmode($fh, ":utf8")
            || die "can't binmode($fh) to :utf8 encoding: $!";
    }

    pod2usage("$0: usage error: expected arguments\n") if @ARGV == 0;

    # $^S is true inside an eval, where the exception is being trapped
    local $SIG{__DIE__} = sub {
        confess "Untrapped fatal exception: @_" unless $^S;
    };

    Getopt::Long::Configure qw[ bundling auto_version ];
    GetOptions(\%Opt => qw[
        all|a
        debug|d
        general|g
        help|?
        list|l
        man|m
        negated|n
        perl|p
        reorder|r
        single|1
        CamelCase|C
        titlecase|t
        unicode|u
        verbose|v
        width|w|columns|c=i
    ]) || pod2usage(2);

    pod2usage(0) if $Opt{help};
    pod2usage(-exitstatus => 0, -verbose => 2) if $Opt{man};
    sysprops() if $Opt{list};
    pod2usage("$0: expected arguments\n") if @ARGV == 0;

    if (grep /\P{ASCII}/ => @ARGV) {
        @ARGV = map { decode("UTF-8", $_) } @ARGV;
    }

    my $hex_spec = qr{
        # let them specify hex code point in many ways
        (?: 0x | U\+ ) (?<HEX> (?&hex) )
      |
        \\[xu] (?:
            (?<HEX> (?&hex) )
          |
            \{ (?<HEX> (?&hex) ) \}
        )
        (?(DEFINE) (?<hex> \p{HexDigit}+ ) )
    }xi;

    # A named loop variable is used on purpose: "for my $_" (lexical $_)
    # was removed from Perl in 5.24 and no longer compiles.
    ARG: for my $arg (@ARGV) {

        if (length($arg) == 1) {
            uprops_chr($arg);
            next ARG;
        }

        if ($arg =~ /^$hex_spec$/) {
            my $codepoint = hex($+{HEX});
            uprops_cp($codepoint);
            next ARG;
        }

        my $codepoint = charnames::vianame($arg);

        if (!defined $codepoint) {
            # can't get use warnings qw[FATAL all] to work here
            local $SIG{__WARN__} = sub {
                if ($_[0] =~ /Unknown charname/) {
                    die "@_";
                } else {
                    my $err = $_[0];
                    $err =~ s/ at .*\n//;
                    $Errors++;
                    warn "$0: $err.\n";
                }
            };
            # NOTE(security): the argument is spliced into code that is
            # then compiled, so an argument containing ")" can run
            # arbitrary Perl.  Left as is because the text comes from the
            # user's own command line; do not feed this untrusted input.
            eval "\$codepoint = ord qq(\\N{$arg})";
            undef $codepoint if $@;
        }

        if (!defined $codepoint) {
            if ($arg =~ /^\p{HexDigit}+$/) {
                $codepoint = hex($arg);
            } else {
                # print, not printf: the argument may contain "%"
                print STDERR "$0: no character named ". quote($arg). "\n";
                $Errors++;
                next ARG;
            }
        }

        uprops_cp($codepoint);
    }
}
################################################################
# debug - print one message line on STDERR, but only when --debug is set.
sub debug($) {
    my ($msg) = @_;
    return unless $Opt{debug};
    return print STDERR "$msg\n";
}
# quote - wrap a character in quotation marks for display.
#
# The marks are single angle quotation marks when ${^UTF8LOCALE} is true,
# and plain "<" and ">" otherwise.  A string that is exactly one \pC or
# \pZ character, or whose first code point is above 0x10FFFF, is shown as
# "U+XXXX" rather than literally.
#
# Works on a named lexical rather than "my $_", which was removed from
# Perl in 5.24 and no longer compiles.
sub quote($) {
    my $string = shift();
    my($LQ, $RQ) = ${^UTF8LOCALE}
        ? ("\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}",
           "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}")
        : qw[ < > ];
    if (ord($string) > 0x10_FFFF || $string =~ /^[\pC\pZ]\z/) {
        $string = sprintf("U+%04X", ord($string));
    }
    return $LQ . $string . $RQ;
}
# cp2name - name of a numeric code point, for display.
#
# Returns "NULL" for code point 0; otherwise the charnames::viacode name
# when the code point is at most 0x10FFFF and has one; otherwise the
# "U+XXXX" form.  Dies when the argument does not look like a number.
sub cp2name($) {
    my ($cp) = @_;
    die "bad number: $cp" unless looks_like_number($cp);
    return "NULL" if $cp == 0;
    my $name = $cp <= 0x10_FFFF ? charnames::viacode($cp) : undef;
    return $name || sprintf("U+%04X", $cp);
}
# init_screen() and writeln() share the lexical $_Format_Line, which is the
# only field of the output format that init_screen() builds.
{
my $_Format_Line;
# init_screen - define the STDOUT format according to the screen width.
#
# The width comes from --width if set; otherwise, on systems that
# am_unixy() accepts, from the second number printed by "stty size";
# otherwise it is 80.  The format is built as text and compiled with a
# string eval; it fills lines from $_Format_Line, continuation lines being
# indented further.  Dies if the format does not compile.
sub init_screen() {
my $cols;
if ($Opt{width}) {
$cols = $Opt{width};
} elsif (am_unixy()) {
($cols) = `stty size 2>&1` =~ /^\d+ (\d+)$/;
}
$cols ||= 80; # non-unix or stty error
debug("columns are $cols");
$cols -= 2;
my $format = "format STDOUT = \n"
. ' ^' . '<' x ($cols-4) . "\n"
. '$_Format_Line' . "\n"
. " ~~ ^" . "<" x ($cols-7) . "\n"
. '$_Format_Line' . "\n"
. ".\n"
. "1;"
;
debug($format);
eval($format) || die "FORMAT FAILED: $@";
}
# writeln - print a list of property names.
#
# With --single, prints one name per line.  Otherwise joins the names with
# spaces and writes them through the STDOUT format, which is created on
# first use.  Every name first goes through verbose().  An empty list
# prints nothing.
sub writeln(@) {
return unless @_;
if ($Opt{single}) {
say for verbose @_;
return;
}
init_screen() unless *STDOUT{FORMAT};
$_Format_Line = join (" " => verbose @_);
write if length($_Format_Line);
}
}
# verbose - return a copy of the argument list in which, when --verbose
# is set, each name is wrapped as "\p{name}"; otherwise the names are
# returned unchanged.  Must be called in list context, or it dies.
sub verbose(@) {
    wantarray() or die("MISCALLED");
    my @decorated;
    for my $name (@_) {
        push @decorated, $Opt{verbose} ? "\\p{$name}" : $name;
    }
    return @decorated;
}
# am_unixy - return 0 when the operating system name in $^O, compared
# case-insensitively, is on the list of non-Unix systems, and 1 otherwise.
sub am_unixy() {
    my %is_alien = map { (lc($_) => 1) } qw[
        MSWin32 MacOS VMS DOS
        NetWare beos epoc os2
        symbian
    ];
    return $is_alien{ lc $^O } ? 0 : 1;
}
# uprops_cp - report the properties of a character given as a numeric
# code point.
#
# Surrogates and code points above 0x10FFFF are rejected (message on
# STDERR, $Errors incremented) unless the running perl compares as newer
# than 5.13.4.  For a code point above 0x10FFFF that is let through,
# --negated is switched on for the duration of the call and a warning is
# printed.  The code point is then turned into a character and handed to
# uprops_chr().
sub uprops_cp($) {
my $codepoint = shift();
# The doubled braces make an inner bare block, so that "next" leaves just
# that block and skips the error report.
if ($codepoint >= 0x00_D800 && $codepoint <= 0x00_DFFF) {{
next if $^V gt 5.13.4;
printf STDERR "$0: code point U+%06X is a UTF-16 surrogate\n",
$codepoint;
$Errors++;
return;
}}
my $am_super = ($codepoint > 0x10_FFFF);
if ($am_super) {{
next if $^V gt 5.13.4;
printf STDERR "$0: code point U+%06X above Unicode maximum\n",
$codepoint;
$Errors++;
return;
}}
local $Opt{negated} = 1 if $am_super;
printf STDERR <<"EO_WARNING", $codepoint if $am_super;
$0: Code point 0x%X is not Unicode: No properties match it, but all inverse properties do so.
EO_WARNING
# see "Unicode non-character %s is illegal for interchange" in perldiag(1)
my $character = eval {
# use warnings "utf8";
no warnings "utf8";
chr($codepoint);
};
if ($@) {
# NOTE(review): with the "return" below commented out, an "is illegal"
# error is reported and counted, and then still dies as an unforeseen
# exception.  Confirm whether that is intended.
if ($@ =~ /is illegal/) {
chomp $@;
$@ =~ s/ at \S+ line \d+//;
printf STDERR "$0: $@\n";
$Errors++;
# return;
}
# propagate unforeseen exception
die "UNFORESEEN EXCEPTION $@";
}
uprops_chr($character);
}
# uprops_chr - print a header line for one character, then its properties.
#
# The header gives the code point, the quoted character and its \N{...}
# name.  The properties from perlprops() are printed unless --unicode was
# given without --perl or --negated; with --negated, the entries that do
# not match the first pattern are printed as a second group.  The
# properties from uniprops() are printed unless --perl was given without
# --unicode, as up to three groups: entries containing a backslash, then
# plain names, then (only with --all or --general) entries that start with
# an underscore or contain "=" or ":".
sub uprops_chr($) {
    my ($char) = @_;

    # fixes "the Unicode bug"
    $char = decode("iso-8859-1", $char) unless utf8::is_utf8($char);

    printf "U+%04X %s ", ord($char), quote($char);
    printf "\\N{%s}\n", cp2name(ord $char);

    if (!$Opt{unicode} || $Opt{perl} || $Opt{negated}) {
        my $positive   = qr/ \\ ( \p{Lower} | R ) /x;
        my @perl_props = perlprops($char);
        writeln(grep { /$positive/ } @perl_props);
        writeln(grep { !/$positive/ } @perl_props) if $Opt{negated};
    }

    if (!$Opt{perl} || $Opt{unicode}) {
        my (@short, @plain, @long);
        for my $prop (uniprops($char)) {
            if    ($prop =~ /\\/)            { push @short, $prop }
            elsif ($prop =~ /(?:^_|[=:])/)   { push @long,  $prop }
            else                             { push @plain, $prop }
        }

        writeln(reorder(@short));
        writeln(reorder(@plain));

        if ($Opt{all} || $Opt{general}) {
            my $is_gc = qr/^(?:gc|general_category)[:=]/i;
            @long = grep { !/$is_gc/ } @long unless $Opt{general};
            @long = grep {  /$is_gc/ } @long unless $Opt{all};
            writeln(reorder(@long));
        }
    }
}
# Hand back the argument list, sorted with the "attractively"
# comparator when $Opt{reorder} is set and untouched otherwise.
sub reorder(@) {
    return @_ unless $Opt{reorder};
    return sort attractively @_;
}
# Sort comparator (works on the package variables $a and $b): shorter
# strings first, then case-insensitive order, then plain string order
# to break any remaining tie.
sub attractively {
    my $by_length = length($a) <=> length($b);
    return $by_length if $by_length;

    my $by_folded = uc($a) cmp uc($b);
    return $by_folded if $by_folded;

    return $a cmp $b;
}
# List the Perl regex shortcuts that match a character.
#
# Takes one string and returns, in the fixed order of the table below,
# every shortcut (as literal text such as '\w' or '\p{Lu}') whose
# pattern matches that string.  In scalar context the number of
# matching shortcuts is returned.
#
# The string is matched through a named lexical rather than "my $_":
# lexical $_ was removed in Perl 5.24 and no longer compiles.  Each
# shortcut is spelled once here and interpolated into the pattern, so
# the test and the reported text cannot drift apart.
sub perlprops($) {
    my $char = shift();
    no warnings "utf8";

    my @shortcuts = (
        # character-class escapes and their negations
        qw( \w \W  \s \S  \d \D  \h \H  \v \V  \R ),
        # General_Category=Letter
        qw( \pL \p{LC} \p{L_} \p{L&} \p{Lu} \p{Ll} \p{Lt} \p{Lm} \p{Lo} ),
        # General_Category=Mark
        qw( \pM \p{Mn} \p{Mc} \p{Me} ),
        # General_Category=Number
        qw( \pN \p{Nd} \p{Nl} \p{No} ),
        # General_Category=Punctuation
        qw( \pP \p{Pc} \p{Pd} \p{Ps} \p{Pe} \p{Pi} \p{Pf} \p{Po} ),
        # General_Category=Symbol
        qw( \pS \p{Sm} \p{Sc} \p{Sk} \p{So} ),
        # General_Category=Separator
        qw( \pZ \p{Zs} \p{Zl} \p{Zp} ),
        # General_Category=Other
        qw( \pC \p{Cc} \p{Cf} \p{Cs} \p{Co} \p{Cn} ),
    );

    return grep { $char =~ /$_/ } @shortcuts;
}
################################################################
# This autoloading stub replaces itself with the real function,
# then jumps directly into its replacement via magic goto.
#
# The replacement is generated as source text with one test per
# entry of @All_Properties (loaded first if still empty); it
# returns the names of the properties that match its argument.
#
# The generated code matches against a named lexical instead of
# "my $_", which was removed in Perl 5.24 and no longer compiles.
# @_ is left untouched here so that the goto hands the original
# argument to the replacement.
################################################################
sub uniprops($) {
    load_properties() unless @All_Properties;

    my $code = dequeue(q<|Q|>, <<'END_OF_CODE_1');
        |Q|
        |Q|  no warnings "redefine";
        |Q|
        |Q|  sub uniprops($) {
        |Q|      no warnings "utf8";
        |Q|      my $char    = shift();
        |Q|      my @retlist = ();
        |Q|
END_OF_CODE_1

    for my $propname (@All_Properties) {
        $code .= dequeue(q<|QQ|>, <<"END_OF_CODE_2");
            |QQ|
            |QQ|      \$char =~ /\\p{$propname}/ && push \@retlist => "$propname";
            |QQ|
END_OF_CODE_2
    }

    $code .= dequeue(q<|Q|>, <<'END_OF_CODE_3');
        |Q|
        |Q|      return @retlist;
        |Q|  }
        |Q|
        |Q|  1;  # so result evals to true
        |Q|
END_OF_CODE_3

    # Include the compiler's message; without it a failure here is
    # impossible to diagnose.
    eval($code) || die "CODE ERR: $@";
    goto \&uniprops;
}
# Strip a quoting leader from the start of every line of a text block.
#
# Takes the leader string and the text; returns a copy of the text in
# which each line-initial run of whitespace followed by the leader and
# at most one space has been deleted.  Because that whitespace run may
# include newlines, blank lines directly in front of a leader line are
# removed along with it.
sub dequeue($$) {
    my ($leader, $text) = @_;
    my $leader_rx = qr/^\s*\Q$leader\E ?/m;
    $text =~ s/$leader_rx//g;
    return $text;
}
# Normalize the spelling of a property name.
#
# Takes one name (the caller's $_ when called without an argument, per
# the "_" prototype) and returns the cleaned-up copy: whitespace is
# removed, ":" becomes "=", "-" becomes "_", underscores are inserted
# after a few known prefixes, and a short property prefix in front of
# "=" is uppercased.  $Opt{titlecase} or $Opt{CamelCase} then selects
# how word boundaries are written.
#
# The work is done on a named copy aliased through "for" instead of
# "my $_", which was removed in Perl 5.24 and no longer compiles.
sub beautify(_) {
    my $name = shift();

    my $short = qr{
        Bc
        | Blk | Ccc | Dt | Ea | Gc | GCB | Hst | In | Jg
        | Jt | Lb | Nt | Nv | SB | Sc | WB
    }ix;

    for ($name) {
        s/\s//g;
        s/:/=/g;
        s/-/_/g;

        s/CJK\K(\p{Lu})/_$1/;
        s/IPA\K(\p{Lu})/_$1/;
        s/POSIX(\p{Lu})/POSIX_$1/i;
        s/XPOSIX/X_POSIX/;
        s/Linear_?B\K(\p{Lu})/_\u$1/;
        s/^($short)(?==)/\U$1/;

        if ($Opt{titlecase}) {
            s/(?!NaN)\pL\K(\p{Ll})(\p{Lu})/${1}_${2}/g;
            s/_(and|to)_/_\L${1}_/gi;
        }
        elsif ($Opt{CamelCase}) {
            s/\p{Ll}\K_(\pL)/\u$1/g;
        }
    }

    return $name;
}
# Fill @All_Properties with the known Unicode property names.
#
# The names come from perluniprops.pod when it can be found under
# $Config{installprivlib}/pods, $Config{installprivlib}/pod, or ./pod
# (tried in that order); otherwise from the static list on <DATA>.
# Each name is checked with valid_prop(), normalized with beautify(),
# and stored only once.  Dies if the pod file cannot be opened or
# closed, or if it no longer has the expected layout.
#
# Uses named lexicals instead of "my $_" (removed in Perl 5.24), opens
# the pod with three-argument open so the path is never parsed for a
# mode, and keeps the slurp-mode $/ confined to the single read.
sub load_properties() {
    my %seen;

    use Config;
    my $privlib = "$Config{installprivlib}";

    my $podlib = "";
    if (-d "$privlib/pods") {
        $podlib = "$privlib/pods";
    }
    elsif (-d "$privlib/pod") {
        $podlib = "$privlib/pod";
    }
    elsif (-d "pod") {
        $podlib = "pod";
    }
    else {
        # FALL THROUGH
    }

    # With no pod directory there is no file to look for; do not go
    # probing "/perluniprops.pod" at the filesystem root.
    my $unipod = length($podlib) ? "$podlib/perluniprops.pod" : "";

    unless (length($unipod) && -e $unipod) {
        debug("reading properties from DATA");
        # can't find it, reading static list from <DATA>
        while (my $line = <DATA>) {
            chomp $line;
            next if $line =~ /^\s*$/;
            next if $line =~ /^\s*#/;  # in case wish some commented out
            next unless valid_prop($line);
            my $propname = beautify($line);
            next if $seen{$propname}++;
            push @All_Properties, $propname;
        }
        # perl5.10 bug: leaves "In" in the format accumulator!!
        $^A = q();
        my $count = @All_Properties;
        debug("read $count properties from <DATA>");
        return;
    }

    debug("reading props from $unipod");

    open(my $pod_fh, "<", $unipod)
        || die "can't open $unipod: $!";
    my $pod_text = do { local $/ = undef; <$pod_fh> };
    close($pod_fh) || die "$0: can't close $unipod: $!";

    # Throw away everything up to and including the "NAME INFO" table
    # header, and everything from the next =head1 onward.
    $pod_text =~ s/ .*? ^ \s+ NAME \s+ INFO \s* \n //msx
        || die "$0: $unipod changed format";
    $pod_text =~ s/ ^ =head1 .* \z //msx
        || die "$0: $unipod changed format";
    $pod_text =~ s/\n {10,}/ /g;  # fix continuation lines

    ## D means this is deprecated.
    ## O means this is obsolete.
    ## S means this is stabilized.
    ## T means tighter (stricter) name matching applies.
    ## X means use of this form is discouraged.
    # (filter deliberately left switched off)
    0 and $pod_text =~ s/ ^ \s+ [DSX] \s+ .* \n//gmx;

    my $prop_rx = qr{
        \\ p \{
            (?<PROPNAME> [\w\s\-.:=] + )
        \}
    }x;

    while ($pod_text =~ /$prop_rx/g) {
        my $propname = beautify($+{PROPNAME});
        next unless valid_prop($propname);

        # just once each
        next if $seen{$propname}++;

        # remove props with leading underscores
        # (filter deliberately left switched off)
        next if 0 and $propname =~ /^_/;

        # remove redundant booleans
        next if $propname =~ m{
            [:=]
            (?:
                Y (?: es )?
              | N o ?
            )
            $
        }x;

        push @All_Properties, $propname;
    }

    my $count = @All_Properties;
    debug("read $count properties from $unipod");
}
# Test whether this perl accepts a property name inside \p{...}.
#
# Takes one property name.  Returns 1 when a pattern using it compiles
# and runs without raising an error or a warning, and 0 otherwise.
#
# Warnings are turned into exceptions so that a name perl merely
# complains about is rejected too.  The handler is installed with
# "local" so it covers only this check: a plain assignment would
# leave every later warning in the program fatal.  The name is
# interpolated into a pattern at run time, so no source text has to
# be built and eval'd.
sub valid_prop($) {
    my $propname = shift();

    local $SIG{__WARN__} = sub { die "PROPERTY ERROR @_" };

    my $ok = eval {
        "whatever" =~ /\p{$propname}/;
        1;
    };
    return $ok || 0;
}
# Print every known property, one per line, and exit the program.
#
# The property list is loaded first if it is still empty.  When
# standard output is a terminal the listing is piped through
# $ENV{PAGER} (falling back to "more"); for a pager whose name
# contains "more" or "less", LESSCHARSET is set to utf-8 unless it
# already has that value.
sub sysprops() {
    @All_Properties or load_properties();

    if (-t STDOUT) {
        my $pager   = $ENV{PAGER} || "more";
        my $charset = $ENV{LESSCHARSET} || "";
        $ENV{LESSCHARSET} = "utf-8"
            if $pager =~ /more|less/ && $charset ne "utf-8";

        open(STDOUT, "| $pager")
            || die "can't open pipe to $pager: $!";
        # quit quietly if the pager goes away before we are done
        $SIG{PIPE} = sub { exit };
    }

    for my $prop (reorder verbose @All_Properties) {
        say $prop;
    }
    exit;  # explicit pclose() in atexit() handler
}
################################################################
################################################################
################################################################
=encoding utf8
=head1 NAME
uniprops - list Unicode properties for one or more characters
=head1 SYNOPSIS
uniprops [I<options>] I<character> | U+I<codepoint> | "I<name>" ...
Options:
--version print version information
--help this message
--man full manpage
--unicode list simple Unicode properties (DEFAULT)
--general include even the long form of general properties
--perl list lowercase Perl short-cuts, plus \R (DEFAULT)
--negated list uppercase Perl short-cuts
--all list all Unicode categories, not just one-parters
--list list all known Unicode properties, then exit
--reorder sort Unicode property lists shortest first
--single output each property one per line
--verbose wrap Unicode properties in \p{xxx}
--width N set column width
--debug noisy internal processing
options may be bundled if used in the short form; e.g., -va
=head1 DESCRIPTION
Each argument to I<uniprops> specifies a character in one of three forms:
=over
=item 1.
a one-character literal, such as "#" or "A".
=item 2.
a code point number in hex, (optionally) prefixed by "0x" or "U+", or "\x"
or "\u", with the backslash prefixes admitting but not requiring enclosing
curly braces. Examples: "0x23", "U+394", "\x{0394}", "0394".
=item 3.
a case-sensitive character name, such as "COMMA" or "GREEK CAPITAL LETTER DELTA".
Names may be specified by their full names or their short names
per the L<charnames> pragma, or they may be Latin or Greek (in that order).
See the EXAMPLES.
=back
The I<uniprops> program reports the properties that apply to a given
character for use in regular expressions. By default, the Perl character
class short-cuts and the one-part Unicode properties are listed, which
are mostly those from the general category.
The B<--all> option adds all the two-part Unicode properties from the
non-general categories.
Long, two-part forms of general category properties are not listed unless
the B<--general> option is given.
The B<--negated> option adds the Perl shortcuts that are in capitals. The
B<--verbose> option encloses Unicode properties with C<\p{I<PROPNAME>}>.
To simply list out all available Unicode properties, use the B<--list>
option, which then exits without processing further arguments.
Lines will be wrapped before the edge of your screen. You can override
the window width with the B<--width I<NN>> option. To get only one property
per line without any indentation, use the B<--single> or B<-1> option.
Unicode properties are by default listed in the same order in which they
occur in L<perluniprops>(1), but the B<--reorder> option will sort them
shortest to longest.
Unicode properties designated as deprecated, obsolete, or discouraged,
or which begin with an underscore, are ignored.
It takes quite some time to load up and test all the Unicode properties,
so if you just need confirmation of a character, just ask for Perl
properties, not Unicode ones, and it will run at least six times faster.
=head1 EXAMPLES
Count known Unicode properties:
$ uniprops -l | wc -l
2478
List all known Unicode properties, sorted by length:
$ uniprops -lr
List all known Unicode properties, sorted by name:
$ uniprops -l | sort -df | more
List Greek-related Unicode properties:
$ uniprops -l | grep Greek | sort -dfu
Blk=Greek
Block:Ancient_Greek_Musical_Notation
Block:Ancient_Greek_Numbers
Block:Greek
Block=Greek_And_Coptic
Block:Greek_Extended
Greek
Greek_And_Coptic
InAncientGreekMusicalNotation
InAncientGreekNumbers
InGreek
InGreekExtended
Is_Greek
Script=Greek
List just Perl properties for three I<named> characters:
$ uniprops -p delta greek:delta Greek:Delta
U+1E9F ‹ẟ› \N{ LATIN SMALL LETTER DELTA }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
U+03B4 ‹δ› \N{ GREEK SMALL LETTER DELTA }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
U+0394 ‹Δ› \N{ GREEK CAPITAL LETTER DELTA }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Lu}
List just Perl properties for four I<named> characters:
$ uniprops -p Thorn pi hebrew:alef cyrillic:be
U+00DE ‹Þ› \N{ LATIN CAPITAL LETTER THORN }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Lu}
U+03C0 ‹π› \N{ GREEK SMALL LETTER PI }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
U+05D0 ‹א› \N{ HEBREW LETTER ALEF }:
\w \pL \p{L_} \p{Lo}
U+0431 ‹б› \N{ CYRILLIC SMALL LETTER BE }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
List Perl and Unicode properties for three different I<literal> characters:
$ uniprops \# ç π
U+0023 ‹#› \N{ NUMBER SIGN }:
\pP \p{Po}
All Any ASCII Assigned Common Zyyy Po P Gr_Base
Grapheme_Base Graph GrBase Other_Punctuation Punct Pat_Syn
Pattern_Syntax PatSyn PosixGraph PosixPrint PosixPunct
Print Punctuation
U+00E7 ‹ç› \N{ LATIN SMALL LETTER C WITH CEDILLA }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
All Any Alnum Alpha Alphabetic Assigned InLatin1 Cased
Cased_Letter LC Changes_When_Casemapped CWCM
Changes_When_Titlecased CWT Changes_When_Uppercased CWU Ll
L Gr_Base Grapheme_Base Graph GrBase ID_Continue IDC
ID_Start IDS Letter L_ Latin Latn Lowercase_Letter Lower
Lowercase Print Word XID_Continue XIDC XID_Start XIDS
U+03C0 ‹π› \N{ GREEK SMALL LETTER PI }:
\w \pL \p{LC} \p{L_} \p{L&} \p{Ll}
All Any Alnum Alpha Alphabetic Assigned Greek Is_Greek
InGreek Cased Cased_Letter LC Changes_When_Casemapped CWCM
Changes_When_Titlecased CWT Changes_When_Uppercased CWU Ll
L Gr_Base Grapheme_Base Graph GrBase Grek Greek_And_Coptic
ID_Continue IDC ID_Start IDS Letter L_ Lowercase_Letter
Lower Lowercase Print Word XID_Continue XIDC XID_Start XIDS
Just list Perl shortcuts, including negated ones, for a named character:
$ uniprops -pn LF
U+000A ‹U+000A› \N{ LINE FEED (LF) }:
\s \v \R \pC \p{Cc}
\W \D \H
For the Greek final sigma character, list Unicode properties that are
either one-parters or else two-part general categories:
$ uniprops -ug "greek:final sigma"
U+03C2 ‹ς› \N{ GREEK SMALL LETTER FINAL SIGMA }:
All Any Alnum Alpha Alphabetic Assigned Greek Is_Greek InGreek
Cased Cased_Letter LC Changes_When_Casefolded CWCF
Changes_When_Casemapped CWCM Changes_When_NFKC_Casefolded CWKCF
Changes_When_Titlecased CWT Changes_When_Uppercased CWU Ll L
Gr_Base Grapheme_Base Graph GrBase Grek Greek_And_Coptic
ID_Continue IDC ID_Start IDS Letter L_ Lowercase_Letter Lower
Lowercase Print Word XID_Continue XIDC XID_Start XIDS
General_Category=Cased_Letter General_Category:Cased_Letter Gc=LC
General_Category:L General_Category=Letter General_Category:LC
General_Category:Letter Gc=L General_Category:Ll
General_Category=Lowercase_Letter
General_Category:Lowercase_Letter Gc=Ll
List just Unicode properties for a code point, given in hex:
$ uniprops -u 0xDF
U+00DF ‹ß› \N{ LATIN SMALL LETTER SHARP S }:
All Any Alnum Alpha Alphabetic Assigned InLatin1 Cased
Cased_Letter LC Changes_When_Casefolded CWCF
Changes_When_Casemapped CWCM Changes_When_NFKC_Casefolded
CWKCF Changes_When_Titlecased CWT Changes_When_Uppercased
CWU Ll L Gr_Base Grapheme_Base Graph GrBase ID_Continue
IDC ID_Start IDS Letter L_ Latin Latn Lowercase_Letter
Lower Lowercase Print Word XID_Continue XIDC XID_Start XIDS
List Perl and Unicode properties for a named character, verbosely:
$ uniprops -v "ALEF SYMBOL"
U+2135 ‹ℵ› \N{ ALEF SYMBOL }:
\w \pL \p{L_} \p{Lo}
\p{All} \p{Any} \p{Alnum} \p{Alpha} \p{Alphabetic} \p{Assigned}
\p{InLetterlikeSymbols} \p{Changes_When_NFKC_Casefolded}
\p{CWKCF} \p{Common} \p{Zyyy} \p{L} \p{Lo} \p{Gr_Base}
\p{Grapheme_Base} \p{Graph} \p{GrBase} \p{ID_Continue} \p{IDC}
\p{ID_Start} \p{IDS} \p{Letter} \p{L_} \p{Other_Letter}
\p{Math} \p{Print} \p{Word} \p{XID_Continue} \p{XIDC}
\p{XID_Start} \p{XIDS}
List Unicode properties in all categories except for two-part
general categories:
$ uniprops -au INFINITY
U+221E ‹∞› \N{ INFINITY }:
All Any Assigned InMathematicalOperators Common Zyyy Sm S
Gr_Base Grapheme_Base Graph GrBase Math Math_Symbol
Pat_Syn Pattern_Syntax PatSyn Print Symbol
Age:1.1 Bidi_Class:ON Bidi_Class=Other_Neutral
Bidi_Class:Other_Neutral Bc=ON Block:Mathematical_Operators
Canonical_Combining_Class:0
Canonical_Combining_Class=Not_Reordered
Canonical_Combining_Class:Not_Reordered Ccc=NR
Canonical_Combining_Class:NR Script=Common
Decomposition_Type:None Dt=None East_Asian_Width:A
East_Asian_Width=Ambiguous East_Asian_Width:Ambiguous Ea=A
Grapheme_Cluster_Break:Other GCB=XX Grapheme_Cluster_Break:XX
Grapheme_Cluster_Break=Other Hangul_Syllable_Type:NA
Hangul_Syllable_Type=Not_Applicable
Hangul_Syllable_Type:Not_Applicable Hst=NA
Joining_Group:No_Joining_Group Jg=NoJoiningGroup
Joining_Type:Non_Joining Jt=U Joining_Type:U
Joining_Type=Non_Joining Line_Break:AI Line_Break=Ambiguous
Line_Break:Ambiguous Lb=AI Numeric_Type:None Nt=None
Numeric_Value:NaN Nv=NaN Present_In:1.1 Age=1.1 In=1.1
Present_In:2.0 In=2.0 Present_In:2.1 In=2.1 Present_In:3.0
In=3.0 Present_In:3.1 In=3.1 Present_In:3.2 In=3.2
Present_In:4.0 In=4.0 Present_In:4.1 In=4.1 Present_In:5.0
In=5.0 Present_In:5.1 In=5.1 Present_In:5.2 In=5.2
Script:Common Sc=Zyyy Script:Zyyy Sentence_Break:Other SB=XX
Sentence_Break:XX Sentence_Break=Other Word_Break:Other WB=XX
Word_Break:XX Word_Break=Other
For the HYPHEN character, verbosely list all Unicode properties
including the two-part general categories, one per line, and sort them:
$ uniprops -1vgau HYPHEN | sort
List Perl and Unicode properties for code point U+2212, reordered by
length and with width set to 50:
$ uniprops -r -w 50 U+2212
U+2212 ‹−› \N{ MINUS SIGN }:
\pS \p{Sm}
S Sm All Any Dash Math Zyyy Graph Print
Common GrBase PatSyn Symbol Gr_Base Pat_Syn
Assigned Math_Symbol Grapheme_Base
Pattern_Syntax InMathematicalOperators
Ask for a (currently) unassigned code point:
$ uniprops 1F12F
U+1F12F ‹U+1F12F› \N{ U+1F12F }:
\pC \p{Cn}
All Any InEnclosedAlphanumericSupplement C Other Cn
Unassigned Zzzz Unknown
=head1 ERRORS
It is an error to ask for properties of code points representing
a UTF-16 surrogate.
Characters not legal for interchange are flagged as errors.
=head1 ENVIRONMENT
If your environment smells like it's in a Unicode encoding,
program arguments and output will be in UTF-8. This allows you
to enter a single, literal UTF-8 character as a program argument.
The PAGER environment variable is used for the B<--list> option.
=head1 FILES
The pod source for the L<perluniprops>(1) manpage is parsed to
determine Unicode properties. This is expected to be found
in the Config module's F<$installprivlib/pods> directory.
=head1 PROGRAMS
The L<stty>(1) program is called on Unix systems to determine
the window size.
If the standard output is to a tty when the B<--list> option
is requested, the user's pager is used, defaulting to L<more>(1).
=head1 BUGS
The B<--man> option does not correctly process the page
for UTF-8; L<pod2text>(1) works fine, though.
=head1 SEE ALSO
L<unichars>,
L<uninames>,
L<perluniprops>,
L<perlunicode>,
L<perlrecharclass>,
L<perlre>
=head1 AUTHOR
Tom Christiansen <I<tchrist@perl.com>>
=head1 COPYRIGHT AND LICENCE
Copyright 2011 Tom Christiansen.
This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.
=cut
# static list of properties from 5.14.0 in case can't find
# proplist (like on 5.10)
__END__
Aegean_Numbers
Block=Aegean_Numbers
Age=1.1
Age=2.0
Age=2.1
Age=3.0
Age=3.1
Age=3.2
Age=4.0
Age=4.1
Age=5.0
Age=5.1
Age=5.2
Age=6.0
Age=Unassigned
AHex
ASCII_Hex_Digit
Alchemical_Symbols
Block=Alchemical_Symbols
All
Any
Alnum
Alpha
Alphabetic
Alphabetic_Presentation_Forms
Block=Alphabetic_Presentation_Forms
Ancient_Greek_Musical_Notation
Block=Ancient_Greek_Musical_Notation
Ancient_Greek_Numbers
Block=Ancient_Greek_Numbers
Ancient_Symbols
Block=Ancient_Symbols
Arab
Arabic
Script=Arabic
Block=Arabic
Arabic_Presentation_Forms_A
Block=Arabic_Presentation_Forms_A
Arabic_Presentation_Forms_B
Block=Arabic_Presentation_Forms_B
Arabic_Supplement
Block=Arabic_Supplement
Armenian
Script=Armenian
Armn
Block=Armenian
Armi
Imperial_Aramaic
Script=Imperial_Aramaic
Block=Imperial_Aramaic
Arrows
Block=Arrows
ASCII
Block=Basic_Latin
Assigned
Avestan
Script=Avestan
Avst
Block=Avestan
Bali
Balinese
Script=Balinese
Block=Balinese
Bamu
Bamum
Script=Bamum
Block=Bamum
Bamum_Supplement
Block=Bamum_Supplement
Basic_Latin
Batak
Script=Batak
Batk
Block=Batak
Beng
Bengali
Script=Bengali
Block=Bengali
Bidi_C
Bidi_Control
Bidi_Class=AL
Bidi_Class=Arabic_Letter
Bidi_Class=AN
Bidi_Class=Arabic_Number
BC=AL
BC=AN
Bidi_Class=B
Bidi_Class=Paragraph_Separator
Bidi_Class=BN
Bidi_Class=Boundary_Neutral
BC=BN
Bidi_Class=Common_Separator
BC=CS
Bidi_Class=CS
Bidi_Class=EN
Bidi_Class=European_Number
Bidi_Class=ES
Bidi_Class=European_Separator
Bidi_Class=ET
Bidi_Class=European_Terminator
BC=EN
BC=ES
BC=ET
Bidi_Class=L
Bidi_Class=Left_To_Right
BC=L
Bidi_Class=Left_To_Right_Embedding
BC=LRE
Bidi_Class=Left_To_Right_Override
BC=LRO
Bidi_Class=LRE
Bidi_Class=LRO
Bidi_Class=Nonspacing_Mark
BC=NSM
Bidi_Class=NSM
Bidi_Class=ON
Bidi_Class=Other_Neutral
BC=ON
BC=B
Bidi_Class=PDF
Bidi_Class=Pop_Directional_Format
BC=PDF
Bidi_Class=R
Bidi_Class=Right_To_Left
BC=R
Bidi_Class=Right_To_Left_Embedding
BC=RLE
Bidi_Class=Right_To_Left_Override
BC=RLO
Bidi_Class=RLE
Bidi_Class=RLO
Bidi_Class=S
Bidi_Class=Segment_Separator
BC=S
Bidi_Class=White_Space
BC=WS
Bidi_Class=WS
BidiC
Bidi_M
Bidi_Mirrored
BidiM
Blank
InAegeanNumbers
InAlchemicalSymbols
InAlphabeticPresentationForms
InAncientGreekMusicalNotation
InAncientGreekNumbers
InAncientSymbols
InArabic
Is_Arabic
InArabicPresentationFormsA
InArabicPresentationFormsB
InArabicSupplement
InArmenian
Is_Armenian
InArrows
Block=ASCII
InAvestan
Is_Avestan
InBalinese
Is_Balinese
InBamum
Is_Bamum
InBamumSupplement
BLK=ASCII
InBatak
Is_Batak
InBengali
Is_Bengali
Block=Block_Elements
InBlockElements
Block=Bopomofo
InBopomofo
Bopomofo
Is_Bopomofo
Block=Bopomofo_Extended
InBopomofoExtended
Block=Box_Drawing
InBoxDrawing
Block=Brahmi
InBrahmi
Brahmi
Is_Brahmi
Block=Braille_Patterns
InBraillePatterns
Block=Buginese
InBuginese
Buginese
Is_Buginese
Block=Buhid
InBuhid
Buhid
Is_Buhid
Block=Byzantine_Musical_Symbols
InByzantineMusicalSymbols
Block=Canadian_Syllabics
Block=Unified_Canadian_Aboriginal_Syllabics
Block=Carian
InCarian
Carian
Is_Carian
Block=Cham
InCham
Cham
Is_Cham
Block=Cherokee
InCherokee
Cherokee
Is_Cherokee
Block=CJK_Compatibility
InCJK_Compatibility
Block=CJK_Compatibility_Forms
InCJK_CompatibilityForms
Block=CJK_Compatibility_Ideographs
InCJK_CompatibilityIdeographs
Block=CJK_Compatibility_Ideographs_Supplement
InCJK_CompatibilityIdeographs_Supplement
Block=CJK_Radicals_Supplement
InCJK_RadicalsSupplement
Block=CJK_Strokes
InCJK_Strokes
Block=CJK_Symbols_And_Punctuation
InCJK_SymbolsAndPunctuation
Block=CJK_Unified_Ideographs
InCJK_UnifiedIdeographs
Block=CJK_Unified_Ideographs_Extension_A
InCJK_UnifiedIdeographsExtensionA
Block=CJK_Unified_Ideographs_Extension_B
InCJK_UnifiedIdeographsExtensionB
Block=CJK_Unified_Ideographs_Extension_C
InCJK_UnifiedIdeographsExtensionC
Block=CJK_Unified_Ideographs_Extension_D
InCJK_UnifiedIdeographsExtensionD
Block=Combining_Diacritical_Marks
InCombiningDiacriticalMarks
Block=Combining_Diacritical_Marks_For_Symbols
BLK=CombiningMarksForSymbols
InCombiningMarksForSymbols
Block=Combining_Diacritical_Marks_Supplement
InCombiningDiacriticalMarks_Supplement
Block=Combining_Half_Marks
InCombiningHalfMarks
Block=Combining_Marks_For_Symbols
Block=Common_Indic_Number_Forms
InCommonIndicNumberForms
Block=Control_Pictures
InControlPictures
Block=Coptic
InCoptic
Coptic
Is_Coptic
Block=Counting_Rod_Numerals
InCountingRodNumerals
Block=Cuneiform
InCuneiform
Cuneiform
Is_Cuneiform
Block=Cuneiform_Numbers_And_Punctuation
InCuneiformNumbersAndPunctuation
Block=Currency_Symbols
InCurrencySymbols
Block=Cypriot_Syllabary
InCypriotSyllabary
Block=Cyrillic
InCyrillic
Cyrillic
Is_Cyrillic
Block=Cyrillic_Extended_A
InCyrillicExtendedA
Block=Cyrillic_Extended_B
InCyrillicExtendedB
Block=Cyrillic_Supplement
InCyrillicSupplement
Block=Cyrillic_Supplementary
Block=Deseret
InDeseret
Block=Devanagari
InDevanagari
Devanagari
Is_Devanagari
Block=Devanagari_Extended
InDevanagariExtended
Block=Dingbats
InDingbats
Block=Domino_Tiles
InDominoTiles
Block=Egyptian_Hieroglyphs
InEgyptianHieroglyphs
Egyptian_Hieroglyphs
Is_Egyptian_Hieroglyphs
Block=Emoticons
InEmoticons
Block=Enclosed_Alphanumeric_Supplement
InEnclosedAlphanumericSupplement
Block=Enclosed_Alphanumerics
InEnclosedAlphanumerics
Block=Enclosed_CJK_Letters_And_Months
InEnclosedCJK_LettersAndMonths
Block=Enclosed_Ideographic_Supplement
InEnclosedIdeographicSupplement
Block=Ethiopic
InEthiopic
Ethiopic
Is_Ethiopic
Block=Ethiopic_Extended
InEthiopicExtended
Block=Ethiopic_Extended_A
InEthiopicExtendedA
Block=Ethiopic_Supplement
InEthiopicSupplement
Block=General_Punctuation
InGeneralPunctuation
Block=Geometric_Shapes
InGeometricShapes
Block=Georgian
InGeorgian
Georgian
Is_Georgian
Block=Georgian_Supplement
InGeorgianSupplement
Block=Glagolitic
InGlagolitic
Glagolitic
Is_Glagolitic
Block=Gothic
InGothic
Gothic
Is_Gothic
Block=Greek
Block=Greek_And_Coptic
Greek
Is_Greek
BLK=Greek
InGreek
Block=Greek_Extended
InGreekExtended
Block=Gujarati
InGujarati
Gujarati
Is_Gujarati
Block=Gurmukhi
InGurmukhi
Gurmukhi
Is_Gurmukhi
Block=Halfwidth_And_Fullwidth_Forms
InHalfwidthAndFullwidthForms
Block=Hangul_Compatibility_Jamo
InHangulCompatibilityJamo
Block=Hangul_Jamo
InHangulJamo
Block=Hangul_Jamo_Extended_A
InHangulJamoExtendedA
Block=Hangul_Jamo_Extended_B
InHangulJamoExtendedB
Block=Hangul_Syllables
InHangulSyllables
Block=Hanunoo
InHanunoo
Hanunoo
Is_Hanunoo
Block=Hebrew
InHebrew
Hebrew
Is_Hebrew
Block=High_Private_Use_Surrogates
InHighPrivateUseSurrogates
Block=High_Surrogates
InHighSurrogates
Block=Hiragana
InHiragana
Hiragana
Is_Hiragana
Block=Ideographic_Description_Characters
InIdeographicDescriptionCharacters
InImperialAramaic
Is_Imperial_Aramaic
Block=Inscriptional_Pahlavi
InInscriptionalPahlavi
Inscriptional_Pahlavi
Is_Inscriptional_Pahlavi
Block=Inscriptional_Parthian
InInscriptionalParthian
Inscriptional_Parthian
Is_Inscriptional_Parthian
Block=IPA_Extensions
InIPA_Extensions
Block=Javanese
InJavanese
Javanese
Is_Javanese
Block=Kaithi
InKaithi
Kaithi
Is_Kaithi
Block=Kana_Supplement
InKanaSupplement
Block=Kanbun
InKanbun
Block=Kangxi_Radicals
InKangxiRadicals
Block=Kannada
InKannada
Kannada
Is_Kannada
Block=Katakana
InKatakana
Katakana
Is_Katakana
Block=Katakana_Phonetic_Extensions
InKatakanaPhoneticExtensions
Block=Kayah_Li
InKayahLi
Block=Kharoshthi
InKharoshthi
Kharoshthi
Is_Kharoshthi
Block=Khmer
InKhmer
Khmer
Is_Khmer
Block=Khmer_Symbols
InKhmerSymbols
Block=Lao
InLao
Lao
Is_Lao
Block=Latin_1
Block=Latin_1_Supplement
BLK=Latin1
InLatin1
Block=Latin_Extended_A
InLatinExtendedA
Block=Latin_Extended_Additional
InLatinExtendedAdditional
Block=Latin_Extended_B
InLatinExtendedB
Block=Latin_Extended_C
InLatinExtendedC
Block=Latin_Extended_D
InLatinExtendedD
Block=Lepcha
InLepcha
Lepcha
Is_Lepcha
Block=Letterlike_Symbols
InLetterlikeSymbols
Block=Limbu
InLimbu
Limbu
Is_Limbu
Block=Linear_B_Ideograms
InLinearB_Ideograms
Block=Linear_B_Syllabary
InLinearB_Syllabary
Block=Lisu
InLisu
Block=Low_Surrogates
InLowSurrogates
Block=Lycian
InLycian
Lycian
Is_Lycian
Block=Lydian
InLydian
Lydian
Is_Lydian
Block=Mahjong_Tiles
InMahjongTiles
Block=Malayalam
InMalayalam
Malayalam
Is_Malayalam
Block=Mandaic
InMandaic
Mandaic
Is_Mandaic
Block=Mathematical_Alphanumeric_Symbols
InMathematicalAlphanumericSymbols
Block=Mathematical_Operators
InMathematicalOperators
Block=Meetei_Mayek
InMeeteiMayek
Meetei_Mayek
Is_Meetei_Mayek
Block=Miscellaneous_Mathematical_Symbols_A
InMiscellaneousMathematicalSymbolsA
Block=Miscellaneous_Mathematical_Symbols_B
InMiscellaneousMathematicalSymbolsB
Block=Miscellaneous_Symbols
InMiscellaneousSymbols
Block=Miscellaneous_Symbols_And_Arrows
InMiscellaneousSymbolsAndArrows
Block=Miscellaneous_Symbols_And_Pictographs
InMiscellaneousSymbolsAnd_Pictographs
Block=Miscellaneous_Technical
InMiscellaneousTechnical
Block=Modifier_Tone_Letters
InModifierToneLetters
Block=Mongolian
InMongolian
Mongolian
Is_Mongolian
Block=Musical_Symbols
InMusicalSymbols
Block=Myanmar
InMyanmar
Myanmar
Is_Myanmar
Block=Myanmar_Extended_A
InMyanmarExtendedA
Block=New_Tai_Lue
InNewTaiLue
New_Tai_Lue
Is_New_Tai_Lue
Block=NKo
InNKo
Nko
Is_NKo
Block=No_Block
InNoBlock
Block=Number_Forms
InNumberForms
Block=Ogham
InOgham
Ogham
Is_Ogham
Block=Ol_Chiki
InOlChiki
Block=Old_Italic
InOldItalic
Old_Italic
Is_Old_Italic
Block=Old_Persian
InOldPersian
Old_Persian
Is_Old_Persian
Block=Old_South_Arabian
InOldSouthArabian
Block=Old_Turkic
InOldTurkic
Old_Turkic
Is_Old_Turkic
Block=Optical_Character_Recognition
InOpticalCharacterRecognition
Block=Oriya
InOriya
Oriya
Is_Oriya
Block=Osmanya
InOsmanya
Osmanya
Is_Osmanya
Block=Phags_Pa
InPhagsPa
Phags_Pa
Is_Phags_Pa
Block=Phaistos_Disc
InPhaistosDisc
Block=Phoenician
InPhoenician
Phoenician
Is_Phoenician
Block=Phonetic_Extensions
InPhoneticExtensions
Block=Phonetic_Extensions_Supplement
InPhoneticExtensionsSupplement
Block=Playing_Cards
InPlayingCards
Block=Private_Use
Block=Private_Use_Area
Private_Use
Is_Private_Use
BLK=PrivateUse
InPrivateUse
Block=Rejang
InRejang
Rejang
Is_Rejang
Block=Rumi_Numeral_Symbols
InRumiNumeralSymbols
Block=Runic
InRunic
Runic
Is_Runic
Block=Samaritan
InSamaritan
Samaritan
Is_Samaritan
Block=Saurashtra
InSaurashtra
Saurashtra
Is_Saurashtra
Block=Shavian
InShavian
Block=Sinhala
InSinhala
Sinhala
Is_Sinhala
Block=Small_Form_Variants
InSmallFormVariants
Block=Spacing_Modifier_Letters
InSpacingModifierLetters
Block=Specials
InSpecials
Block=Sundanese
InSundanese
Sundanese
Is_Sundanese
Block=Superscripts_And_Subscripts
InSuperscriptsAndSubscripts
Block=Supplemental_Arrows_A
InSupplementalArrowsA
Block=Supplemental_Arrows_B
InSupplementalArrowsB
Block=Supplemental_Mathematical_Operators
InSupplementalMathematicalOperators
Block=Supplemental_Punctuation
InSupplementalPunctuation
Block=Supplementary_Private_Use_Area_A
InSupplementaryPrivateUseAreaA
Block=Supplementary_Private_Use_Area_B
InSupplementaryPrivateUseAreaB
Block=Syloti_Nagri
InSylotiNagri
Syloti_Nagri
Is_Syloti_Nagri
Block=Syriac
InSyriac
Syriac
Is_Syriac
Block=Tagalog
InTagalog
Tagalog
Is_Tagalog
Block=Tagbanwa
InTagbanwa
Tagbanwa
Is_Tagbanwa
Block=Tags
InTags
Block=Tai_Le
InTaiLe
Tai_Le
Is_Tai_Le
Block=Tai_Tham
InTaiTham
Tai_Tham
Is_Tai_Tham
Block=Tai_Viet
InTaiViet
Tai_Viet
Is_Tai_Viet
Block=Tai_Xuan_Jing_Symbols
InTaiXuanJingSymbols
Block=Tamil
InTamil
Tamil
Is_Tamil
Block=Telugu
InTelugu
Telugu
Is_Telugu
Block=Thaana
InThaana
Thaana
Is_Thaana
Block=Thai
InThai
Thai
Is_Thai
Block=Tibetan
InTibetan
Tibetan
Is_Tibetan
Block=Tifinagh
InTifinagh
Tifinagh
Is_Tifinagh
Block=Transport_And_Map_Symbols
InTransportAndMapSymbols
Block=Ugaritic
InUgaritic
Ugaritic
Is_Ugaritic
BLK=CanadianSyllabics
InCanadianSyllabics
Block=Unified_Canadian_Aboriginal_Syllabics_Extended
InUnifiedCanadianAboriginalSyllabics_Extended
Block=Vai
InVai
Vai
Is_Vai
Block=Variation_Selectors
InVariationSelectors
Block=Variation_Selectors_Supplement
InVariationSelectorsSupplement
Block=Vedic_Extensions
InVedicExtensions
Block=Vertical_Forms
InVerticalForms
Block=Yi_Radicals
InYiRadicals
Block=Yi_Syllables
InYiSyllables
Block=Yijing_Hexagram_Symbols
InYijingHexagramSymbols
Block_Elements
Bopo
Script=Bopomofo
Bopomofo_Extended
Box_Drawing
Brah
Script=Brahmi
Brai
Braille
Script=Braille
Braille_Patterns
Bugi
Script=Buginese
Buhd
Script=Buhid
Byzantine_Musical_Symbols
C
Other
General_Category=Other
Canadian_Aboriginal
Script=Canadian_Aboriginal
Cans
Canadian_Syllabics
Unified_Canadian_Aboriginal_Syllabics
Canonical_Combining_Class=0
Canonical_Combining_Class=Not_Reordered
Canonical_Combining_Class=1
Canonical_Combining_Class=Overlay
Canonical_Combining_Class=7
Canonical_Combining_Class=Nukta
Canonical_Combining_Class=8
Canonical_Combining_Class=Kana_Voicing
Canonical_Combining_Class=9
Canonical_Combining_Class=Virama
Canonical_Combining_Class=10
CCC=10
Canonical_Combining_Class=11
CCC=11
Canonical_Combining_Class=12
CCC=12
Canonical_Combining_Class=13
CCC=13
Canonical_Combining_Class=14
CCC=14
Canonical_Combining_Class=15
CCC=15
Canonical_Combining_Class=16
CCC=16
Canonical_Combining_Class=17
CCC=17
Canonical_Combining_Class=18
CCC=18
Canonical_Combining_Class=19
CCC=19
Canonical_Combining_Class=20
CCC=20
Canonical_Combining_Class=21
CCC=21
Canonical_Combining_Class=22
CCC=22
Canonical_Combining_Class=23
CCC=23
Canonical_Combining_Class=24
CCC=24
Canonical_Combining_Class=25
CCC=25
Canonical_Combining_Class=26
CCC=26
Canonical_Combining_Class=27
CCC=27
Canonical_Combining_Class=28
CCC=28
Canonical_Combining_Class=29
CCC=29
Canonical_Combining_Class=30
CCC=30
Canonical_Combining_Class=31
CCC=31
Canonical_Combining_Class=32
CCC=32
Canonical_Combining_Class=33
CCC=33
Canonical_Combining_Class=34
CCC=34
Canonical_Combining_Class=35
CCC=35
Canonical_Combining_Class=36
CCC=36
Canonical_Combining_Class=84
CCC=84
Canonical_Combining_Class=91
CCC=91
Canonical_Combining_Class=103
CCC=103
Canonical_Combining_Class=107
CCC=107
Canonical_Combining_Class=118
CCC=118
Canonical_Combining_Class=122
CCC=122
Canonical_Combining_Class=129
CCC=129
Canonical_Combining_Class=130
CCC=130
Canonical_Combining_Class=132
CCC=132
Canonical_Combining_Class=200
Canonical_Combining_Class=Attached_Below_Left
Canonical_Combining_Class=202
Canonical_Combining_Class=Attached_Below
Canonical_Combining_Class=214
Canonical_Combining_Class=Attached_Above
Canonical_Combining_Class=216
Canonical_Combining_Class=Attached_Above_Right
Canonical_Combining_Class=218
Canonical_Combining_Class=Below_Left
Canonical_Combining_Class=220
Canonical_Combining_Class=Below
Canonical_Combining_Class=222
Canonical_Combining_Class=Below_Right
Canonical_Combining_Class=224
Canonical_Combining_Class=Left
Canonical_Combining_Class=226
Canonical_Combining_Class=Right
Canonical_Combining_Class=228
Canonical_Combining_Class=Above_Left
Canonical_Combining_Class=230
Canonical_Combining_Class=Above
Canonical_Combining_Class=232
Canonical_Combining_Class=Above_Right
Canonical_Combining_Class=233
Canonical_Combining_Class=Double_Below
Canonical_Combining_Class=234
Canonical_Combining_Class=Double_Above
Canonical_Combining_Class=240
Canonical_Combining_Class=Iota_Subscript
Canonical_Combining_Class=A
CCC=A
CCC=AL
CCC=AR
Canonical_Combining_Class=AL
Canonical_Combining_Class=AR
Canonical_Combining_Class=ATA
Canonical_Combining_Class=ATAR
Canonical_Combining_Class=ATB
Canonical_Combining_Class=ATBL
CCC=ATA
CCC=ATAR
CCC=ATB
CCC=ATBL
Canonical_Combining_Class=B
CCC=B
CCC=BL
CCC=BR
Canonical_Combining_Class=BL
Canonical_Combining_Class=BR
Canonical_Combining_Class=DA
Canonical_Combining_Class=DB
CCC=DA
CCC=DB
CCC=IS
Canonical_Combining_Class=IS
CCC=KV
Canonical_Combining_Class=KV
Canonical_Combining_Class=L
CCC=L
Canonical_Combining_Class=NK
CCC=NR
Canonical_Combining_Class=NR
CCC=NK
Canonical_Combining_Class=OV
CCC=OV
Canonical_Combining_Class=R
CCC=R
CCC=VR
Canonical_Combining_Class=VR
Cari
Script=Carian
Case_Ignorable
CI
Cased
Cased_Letter
General_Category=Cased_Letter
LC
Cc
Cntrl
General_Category=Control
CE
Composition_Exclusion
Cf
Format
General_Category=Format
Script=Cham
Changes_When_Casefolded
CWCF
Changes_When_Casemapped
CWCM
Changes_When_Lowercased
CWL
Changes_When_NFKC_Casefolded
CWKCF
Changes_When_Titlecased
CWT
Changes_When_Uppercased
CWU
Cher
Script=Cherokee
CJK_Compatibility
CJK_Compatibility_Forms
CJK_Compatibility_Ideographs
CJK_Compatibility_Ideographs_Supplement
CJK_Radicals_Supplement
CJK_Strokes
CJK_Symbols_And_Punctuation
CJK_Unified_Ideographs
CJK_Unified_Ideographs_Extension_A
CJK_Unified_Ideographs_Extension_B
CJK_Unified_Ideographs_Extension_C
CJK_Unified_Ideographs_Extension_D
Close_Punctuation
General_Category=Close_Punctuation
Pe
Cn
Unassigned
General_Category=Unassigned
Co
General_Category=Private_Use
Private_Use_Area
Combining_Diacritical_Marks
Combining_Diacritical_Marks_For_Symbols
Combining_Diacritical_Marks_Supplement
Combining_Half_Marks
Combining_Marks_For_Symbols
Combining_Diacritical_Marks_For__Symbols
Block=Combining_Diacritical_Marks_For__Symbols
Common
Script=Common
Zyyy
Common_Indic_Number_Forms
Comp_Ex
Full_Composition_Exclusion
Connector_Punctuation
General_Category=Connector_Punctuation
Pc
Control
Control_Pictures
Copt
Script=Coptic
Counting_Rod_Numerals
Cprt
Cypriot
Script=Cypriot
Cs
Surrogate
General_Category=Surrogate
Script=Cuneiform
Xsux
Cuneiform_Numbers_And_Punctuation
Currency_Symbol
General_Category=Currency_Symbol
Sc
Currency_Symbols
Cypriot_Syllabary
Script=Cyrillic
Cyrl
Cyrillic_Extended_A
Cyrillic_Extended_B
Cyrillic_Supplement
Cyrillic_Supplementary
Dash
Dash_Punctuation
General_Category=Dash_Punctuation
Pd
Decimal_Number
Digit
General_Category=Decimal_Number
Decomposition_Type=Can
Decomposition_Type=Canonical
DT=Can
Decomposition_Type=Circle
DT=Enc
Decomposition_Type=Com
Decomposition_Type=Compat
DT=Com
Decomposition_Type=Enc
Decomposition_Type=Fin
Decomposition_Type=Final
DT=Fin
Decomposition_Type=Font
DT=Font
Decomposition_Type=Fra
Decomposition_Type=Fraction
DT=Fra
Decomposition_Type=Init
Decomposition_Type=Initial
DT=Init
Decomposition_Type=Iso
Decomposition_Type=Isolated
DT=Iso
Decomposition_Type=Med
Decomposition_Type=Medial
DT=Med
Decomposition_Type=Nar
Decomposition_Type=Narrow
DT=Nar
Decomposition_Type=Nb
Decomposition_Type=Nobreak
DT=Nb
Decomposition_Type=Non_Canon
Decomposition_Type=Non_Canonical
DT=NonCanon
Decomposition_Type=None
DT=None
Decomposition_Type=Small
DT=Sml
Decomposition_Type=Sml
Decomposition_Type=Sqr
Decomposition_Type=Square
DT=Sqr
Decomposition_Type=Sub
DT=Sub
Decomposition_Type=Sup
Decomposition_Type=Super
DT=Sup
Decomposition_Type=Vert
Decomposition_Type=Vertical
DT=Vert
Decomposition_Type=Wide
DT=Wide
Default_Ignorable_Code_Point
DI
Dep
Deprecated
Deseret
Script=Deseret
Dsrt
Deva
Script=Devanagari
Devanagari_Extended
Dia
Diacritic
Nd
Dingbats
Domino_Tiles
East_Asian_Width=A
East_Asian_Width=Ambiguous
EA=A
East_Asian_Width=F
East_Asian_Width=Fullwidth
EA=F
East_Asian_Width=H
East_Asian_Width=Halfwidth
EA=H
East_Asian_Width=Neutral
East_Asian_Width=Na
East_Asian_Width=Narrow
EA=Na
East_Asian_Width=W
East_Asian_Width=Wide
EA=W
Egyp
Script=Egyptian_Hieroglyphs
Emoticons
Enclosed_Alphanumeric_Supplement
Enclosed_Alphanumerics
Enclosed_CJK_Letters_And_Months
Enclosed_Ideographic_Supplement
Enclosing_Mark
General_Category=Enclosing_Mark
Me
Ethi
Script=Ethiopic
Ethiopic_Extended
Ethiopic_Extended_A
Ethiopic_Supplement
Ext
Extender
Final_Punctuation
General_Category=Final_Punctuation
Pf
CompEx
General_Category=C
Ll
Lu
Lt
GC=LC
General_Category=Cc
General_Category=Cf
GC=Pe
General_Category=Cn
General_Category=Cntrl
General_Category=Co
GC=Pc
GC=Cc
General_Category=Cs
GC=Sc
GC=Pd
GC=Nd
General_Category=Digit
GC=Me
GC=Pf
GC=Cf
General_Category=Initial_Punctuation
GC=Pi
Pi
General_Category=L
General_Category=Letter
General_Category=L_
General_Category=LC
GC=L
L
General_Category=Letter_Number
GC=Nl
Nl
General_Category=Line_Separator
GC=Zl
Zl
General_Category=Ll
General_Category=Lowercase_Letter
General_Category=Lm
General_Category=Modifier_Letter
General_Category=Lo
General_Category=Other_Letter
GC=Ll
General_Category=Lt
General_Category=Titlecase_Letter
General_Category=Lu
General_Category=Uppercase_Letter
General_Category=M
General_Category=Mark
GC=M
M
General_Category=Math_Symbol
GC=Sm
Sm
General_Category=Mc
General_Category=Spacing_Mark
General_Category=Me
General_Category=Mn
General_Category=Nonspacing_Mark
GC=Lm
Lm
General_Category=Modifier_Symbol
GC=Sk
Sk
General_Category=Number
General_Category=Nd
General_Category=Nl
General_Category=Other_Number
GC=Mn
Mn
N
General_Category=Open_Punctuation
GC=Ps
Ps
GC=C
GC=Lo
Lo
No
General_Category=Other_Punctuation
GC=Po
Po
General_Category=Other_Symbol
GC=So
So
General_Category=P
General_Category=Punctuation
General_Category=Paragraph_Separator
GC=Zp
Zp
General_Category=Pc
General_Category=Pd
General_Category=Pe
General_Category=Pf
General_Category=Pi
General_Category=Po
GC=Co
General_Category=Ps
General_Category=Punct
GC=P
P
General_Category=S
General_Category=Symbol
General_Category=Sc
General_Category=Separator
GC=Z
Z
General_Category=Sk
General_Category=Sm
General_Category=So
General_Category=Space_Separator
GC=Zs
Zs
GC=Mc
Mc
GC=Cs
GC=S
S
GC=Lt
GC=Cn
GC=Lu
General_Category=Z
General_Category=Zl
General_Category=Zp
General_Category=Zs
General_Punctuation
Geometric_Shapes
Geor
Script=Georgian
Georgian_Supplement
Glag
Script=Glagolitic
Goth
Script=Gothic
Gr_Base
Grapheme_Base
Gr_Ext
Grapheme_Extend
Graph
GrBase
Grapheme_Cluster_Break=CN
Grapheme_Cluster_Break=Control
GCB=CN
Grapheme_Cluster_Break=CR
GCB=CR
Grapheme_Cluster_Break=EX
Grapheme_Cluster_Break=Extend
GCB=EX
Grapheme_Cluster_Break=L
GCB=L
Grapheme_Cluster_Break=LF
GCB=LF
Grapheme_Cluster_Break=LV
GCB=LV
Grapheme_Cluster_Break=LVT
GCB=LVT
Grapheme_Cluster_Break=Other
GCB=XX
Grapheme_Cluster_Break=PP
Grapheme_Cluster_Break=Prepend
GCB=PP
Grapheme_Cluster_Break=SM
Grapheme_Cluster_Break=SpacingMark
GCB=SM
Grapheme_Cluster_Break=T
GCB=T
Grapheme_Cluster_Break=V
GCB=V
Grapheme_Cluster_Break=XX
GrExt
Script=Greek
Grek
Greek_And_Coptic
Greek_Extended
Script=Gujarati
Gujr
Script=Gurmukhi
Guru
Halfwidth_And_Fullwidth_Forms
Han
Script=Han
Hang
Hangul
Script=Hangul
Hangul_Compatibility_Jamo
Hangul_Jamo
Hangul_Jamo_Extended_A
Hangul_Jamo_Extended_B
Hangul_Syllable_Type=L
Hangul_Syllable_Type=Leading_Jamo
HST=L
Hangul_Syllable_Type=LV
Hangul_Syllable_Type=LV_Syllable
HST=LV
Hangul_Syllable_Type=LVT
Hangul_Syllable_Type=LVT_Syllable
HST=LVT
Hangul_Syllable_Type=NA
Hangul_Syllable_Type=Not_Applicable
HST=NA
Hangul_Syllable_Type=T
Hangul_Syllable_Type=Trailing_Jamo
HST=T
Hangul_Syllable_Type=V
Hangul_Syllable_Type=Vowel_Jamo
HST=V
Hangul_Syllables
Hani
Hano
Script=Hanunoo
Hebr
Script=Hebrew
Hex
XDigit
Hex_Digit
High_Private_Use_Surrogates
High_Surrogates
Hira
Script=Hiragana
HorizSpace
ID_Continue
IDC
ID_Start
IDS
Ideo
Ideographic
Ideographic_Description_Characters
IDS_Binary_Operator
IDSB
IDS_Trinary_Operator
IDST
Inherited
Script=Inherited
Zinh
Initial_Punctuation
Script=Inscriptional_Pahlavi
Phli
Script=Inscriptional_Parthian
Prti
IPA_Extensions
Ital
Script=Old_Italic
Java
Script=Javanese
Join_C
Join_Control
JoinC
Joining_Group=Ain
JG=Ain
Joining_Group=Alaph
JG=Alaph
Joining_Group=Alef
JG=Alef
Joining_Group=Beh
JG=Beh
Joining_Group=Beth
JG=Beth
Joining_Group=Burushaski_Yeh_Barree
JG=BurushaskiYehBarree
Joining_Group=Dal
JG=Dal
Joining_Group=Dalath_Rish
JG=DalathRish
Joining_Group=E
JG=E
Joining_Group=Farsi_Yeh
JG=FarsiYeh
Joining_Group=Fe
JG=Fe
Joining_Group=Feh
JG=Feh
Joining_Group=Final_Semkath
JG=FinalSemkath
Joining_Group=Gaf
JG=Gaf
Joining_Group=Gamal
JG=Gamal
Joining_Group=Hah
JG=Hah
Joining_Group=Hamza_On_Heh_Goal
Joining_Group=Teh_Marbuta_Goal
Joining_Group=He
JG=He
Joining_Group=Heh
JG=Heh
Joining_Group=Heh_Goal
JG=HehGoal
Joining_Group=Heth
JG=Heth
Joining_Group=Kaf
JG=Kaf
Joining_Group=Kaph
JG=Kaph
Joining_Group=Khaph
JG=Khaph
Joining_Group=Knotted_Heh
JG=KnottedHeh
Joining_Group=Lam
JG=Lam
Joining_Group=Lamadh
JG=Lamadh
Joining_Group=Meem
JG=Meem
Joining_Group=Mim
JG=Mim
Joining_Group=No_Joining_Group
JG=NoJoiningGroup
Joining_Group=Noon
JG=Noon
Joining_Group=Nun
JG=Nun
Joining_Group=Nya
JG=Nya
Joining_Group=Pe
JG=Pe
Joining_Group=Qaf
JG=Qaf
Joining_Group=Qaph
JG=Qaph
Joining_Group=Reh
JG=Reh
Joining_Group=Reversed_Pe
JG=ReversedPe
Joining_Group=Sad
JG=Sad
Joining_Group=Sadhe
JG=Sadhe
Joining_Group=Seen
JG=Seen
Joining_Group=Semkath
JG=Semkath
Joining_Group=Shin
JG=Shin
Joining_Group=Swash_Kaf
JG=SwashKaf
Joining_Group=Syriac_Waw
JG=SyriacWaw
Joining_Group=Tah
JG=Tah
Joining_Group=Taw
JG=Taw
Joining_Group=Teh_Marbuta
JG=TehMarbuta
JG=TehMarbutaGoal
Joining_Group=Teth
JG=Teth
Joining_Group=Waw
JG=Waw
Joining_Group=Yeh
JG=Yeh
Joining_Group=Yeh_Barree
JG=YehBarree
Joining_Group=Yeh_With_Tail
JG=YehWithTail
Joining_Group=Yudh
JG=Yudh
Joining_Group=Yudh_He
JG=YudhHe
Joining_Group=Zain
JG=Zain
Joining_Group=Zhain
JG=Zhain
Joining_Type=C
Joining_Type=Join_Causing
Joining_Type=D
Joining_Type=Dual_Joining
JT=D
JT=C
Joining_Type=L
Joining_Type=Left_Joining
JT=L
Joining_Type=Non_Joining
JT=U
Joining_Type=R
Joining_Type=Right_Joining
JT=R
Joining_Type=T
Joining_Type=Transparent
JT=T
Joining_Type=U
Script=Kaithi
Kthi
Kali
Kayah_Li
Script=Kayah_Li
Kana
Script=Katakana
Kana_Supplement
Kanbun
Kangxi_Radicals
Script=Kannada
Knda
Katakana_Phonetic_Extensions
Khar
Script=Kharoshthi
Script=Khmer
Khmr
Khmer_Symbols
Letter
L_
Lana
Script=Tai_Tham
Script=Lao
Laoo
Latin
Script=Latin
Latn
Latin_1
Latin_1_Supplement
Latin_Extended_A
Latin_Extended_Additional
Latin_Extended_B
Latin_Extended_C
Latin_Extended_D
Lepc
Script=Lepcha
Letter_Number
Letterlike_Symbols
Limb
Script=Limbu
Linb
Linear_B
Script=Linear_B
Line_Break=AI
Line_Break=Ambiguous
Line_Break=AL
Line_Break=Alphabetic
LB=AL
LB=AI
Line_Break=B2
Line_Break=Break_Both
Line_Break=BA
Line_Break=Break_After
Line_Break=BB
Line_Break=Break_Before
Line_Break=BK
Line_Break=Mandatory_Break
LB=BA
LB=BB
LB=B2
Line_Break=Break_Symbols
LB=SY
Line_Break=Carriage_Return
LB=CR
Line_Break=CB
Line_Break=Contingent_Break
Line_Break=CL
Line_Break=Close_Punctuation
Line_Break=Close_Parenthesis
LB=CP
LB=CL
Line_Break=CM
Line_Break=Combining_Mark
LB=CM
Line_Break=Complex_Context
LB=SA
LB=CB
Line_Break=CP
Line_Break=CR
Line_Break=EX
Line_Break=Exclamation
LB=EX
Line_Break=GL
Line_Break=Glue
LB=GL
Line_Break=H2
LB=H2
Line_Break=H3
LB=H3
Line_Break=HY
Line_Break=Hyphen
LB=HY
Line_Break=ID
Line_Break=Ideographic
LB=ID
Line_Break=IN
Line_Break=Inseparable
Line_Break=Infix_Numeric
LB=IS
LB=IN
Line_Break=Inseperable
Line_Break=IS
Line_Break=JL
LB=JL
Line_Break=JT
LB=JT
Line_Break=JV
LB=JV
Line_Break=LF
Line_Break=Line_Feed
LB=LF
LB=BK
Line_Break=Next_Line
LB=NL
Line_Break=NL
Line_Break=Nonstarter
LB=NS
Line_Break=NS
Line_Break=NU
Line_Break=Numeric
LB=NU
Line_Break=OP
Line_Break=Open_Punctuation
LB=OP
Line_Break=PO
Line_Break=Postfix_Numeric
LB=PO
Line_Break=PR
Line_Break=Prefix_Numeric
LB=PR
Line_Break=QU
Line_Break=Quotation
LB=QU
Line_Break=SA
Line_Break=SP
Line_Break=Space
LB=SP
Line_Break=SY
Line_Break=Unknown
LB=XX
Line_Break=WJ
Line_Break=Word_Joiner
LB=WJ
Line_Break=XX
Line_Break=ZW
Line_Break=ZWSpace
LB=ZW
Line_Separator
Linear_B_Ideograms
Linear_B_Syllabary
Lisu
Script=Lisu
Lowercase_Letter
Modifier_Letter
Other_Letter
LOE
Logical_Order_Exception
Low_Surrogates
Lower
Lowercase
Titlecase_Letter
Uppercase_Letter
Lyci
Script=Lycian
Lydi
Script=Lydian
Mark
Mahjong_Tiles
Script=Malayalam
Mlym
Mand
Script=Mandaic
Math
Math_Symbol
Mathematical_Alphanumeric_Symbols
Mathematical_Operators
Spacing_Mark
Script=Meetei_Mayek
Mtei
Miscellaneous_Mathematical_Symbols_A
Miscellaneous_Mathematical_Symbols_B
Miscellaneous_Symbols
Miscellaneous_Symbols_And_Arrows
Miscellaneous_Symbols_And_Pictographs
Miscellaneous_Technical
Nonspacing_Mark
Modifier_Symbol
Modifier_Tone_Letters
Mong
Script=Mongolian
Musical_Symbols
Script=Myanmar
Mymr
Myanmar_Extended_A
Number
NChar
Noncharacter_Code_Point
Script=New_Tai_Lue
Talu
NFC_Quick_Check=M
NFC_Quick_Check=Maybe
NFCQC=M
NFKC_Quick_Check=M
NFKC_Quick_Check=Maybe
NFKCQC=M
Script=Nko
NKo
Nkoo
Other_Number
No_Block
Number_Forms
Numeric_Type=De
Numeric_Type=Decimal
NT=De
Numeric_Type=Di
Numeric_Type=Digit
NT=Di
Numeric_Type=None
NT=None
Numeric_Type=Nu
Numeric_Type=Numeric
NT=Nu
Numeric_Value=0
NV=0
Numeric_Value=1
NV=1
Numeric_Value=2
NV=2
Numeric_Value=3
NV=3
Numeric_Value=4
NV=4
Numeric_Value=5
NV=5
Numeric_Value=6
NV=6
Numeric_Value=7
NV=7
Numeric_Value=8
NV=8
Numeric_Value=9
NV=9
Numeric_Value=10
NV=10
Numeric_Value=11
NV=11
Numeric_Value=12
NV=12
Numeric_Value=13
NV=13
Numeric_Value=14
NV=14
Numeric_Value=15
NV=15
Numeric_Value=16
NV=16
Numeric_Value=17
NV=17
Numeric_Value=18
NV=18
Numeric_Value=19
NV=19
Numeric_Value=20
NV=20
Numeric_Value=21
NV=21
Numeric_Value=22
NV=22
Numeric_Value=23
NV=23
Numeric_Value=24
NV=24
Numeric_Value=25
NV=25
Numeric_Value=26
NV=26
Numeric_Value=27
NV=27
Numeric_Value=28
NV=28
Numeric_Value=29
NV=29
Numeric_Value=30
NV=30
Numeric_Value=31
NV=31
Numeric_Value=32
NV=32
Numeric_Value=33
NV=33
Numeric_Value=34
NV=34
Numeric_Value=35
NV=35
Numeric_Value=36
NV=36
Numeric_Value=37
NV=37
Numeric_Value=38
NV=38
Numeric_Value=39
NV=39
Numeric_Value=40
NV=40
Numeric_Value=41
NV=41
Numeric_Value=42
NV=42
Numeric_Value=43
NV=43
Numeric_Value=44
NV=44
Numeric_Value=45
NV=45
Numeric_Value=46
NV=46
Numeric_Value=47
NV=47
Numeric_Value=48
NV=48
Numeric_Value=49
NV=49
Numeric_Value=50
NV=50
Numeric_Value=60
NV=60
Numeric_Value=70
NV=70
Numeric_Value=80
NV=80
Numeric_Value=90
NV=90
Numeric_Value=100
NV=100
Numeric_Value=200
NV=200
Numeric_Value=300
NV=300
Numeric_Value=400
NV=400
Numeric_Value=500
NV=500
Numeric_Value=600
NV=600
Numeric_Value=700
NV=700
Numeric_Value=800
NV=800
Numeric_Value=900
NV=900
Numeric_Value=1000
NV=1000
Numeric_Value=2000
NV=2000
Numeric_Value=3000
NV=3000
Numeric_Value=4000
NV=4000
Numeric_Value=5000
NV=5000
Numeric_Value=6000
NV=6000
Numeric_Value=7000
NV=7000
Numeric_Value=8000
NV=8000
Numeric_Value=9000
NV=9000
Numeric_Value=10000
NV=10000
Numeric_Value=20000
NV=20000
Numeric_Value=30000
NV=30000
Numeric_Value=40000
NV=40000
Numeric_Value=50000
NV=50000
Numeric_Value=60000
NV=60000
Numeric_Value=70000
NV=70000
Numeric_Value=80000
NV=80000
Numeric_Value=90000
NV=90000
Numeric_Value=100000
NV=100000
Numeric_Value=100000000
NV=100000000
Numeric_Value=1000000000000
NV=1000000000000
Numeric_Value=NaN
NV=NaN
Ogam
Script=Ogham
Ol_Chiki
Script=Ol_Chiki
Olck
Script=Old_Persian
Xpeo
Old_South_Arabian
Script=Old_South_Arabian
Sarb
Script=Old_Turkic
Orkh
Open_Punctuation
Optical_Character_Recognition
Script=Oriya
Orya
Osma
Script=Osmanya
Other_Punctuation
Other_Symbol
Punct
Paragraph_Separator
Pat_Syn
Pattern_Syntax
Pat_WS
Pattern_White_Space
PatSyn
PatWS
PerlSpace
PerlWord
Phag
Script=Phags_Pa
Phaistos_Disc
Phnx
Script=Phoenician
Phonetic_Extensions
Phonetic_Extensions_Supplement
Playing_Cards
POSIX_Alnum
POSIX_Alpha
POSIX_Blank
POSIX_Cntrl
POSIX_Digit
POSIX_Graph
POSIX_Lower
POSIX_Print
POSIX_Punct
POSIX_Space
POSIX_Upper
POSIX_Word
POSIX_XDigit
Present_In=1.1
IN=1.1
Present_In=2.0
IN=2.0
Present_In=2.1
IN=2.1
Present_In=3.0
IN=3.0
Present_In=3.1
IN=3.1
Present_In=3.2
IN=3.2
Present_In=4.0
IN=4.0
Present_In=4.1
IN=4.1
Present_In=5.0
IN=5.0
Present_In=5.1
IN=5.1
Present_In=5.2
IN=5.2
Present_In=6.0
IN=6.0
Present_In=Unassigned
IN=Unassigned
Print
Punctuation
Qaac
Qaai
QMark
Quotation_Mark
Radical
Script=Rejang
Rjng
Rumi_Numeral_Symbols
Script=Runic
Runr
Symbol
Script=Samaritan
Samr
Saur
Script=Saurashtra
Script=Arab
SC=Arab
SC=Armn
Script=Armi
Script=Armn
SC=Avst
Script=Avst
Script=Bali
SC=Bali
Script=Bamu
SC=Bamu
SC=Batk
Script=Batk
Script=Beng
SC=Beng
Script=Bopo
SC=Bopo
Script=Brah
SC=Brah
Script=Brai
SC=Brai
Script=Bugi
SC=Bugi
Script=Buhd
SC=Buhd
SC=Cans
Script=Cans
Script=Cari
SC=Cari
SC=Cham
Script=Cher
SC=Cher
SC=Zyyy
Script=Copt
SC=Copt
Script=Cprt
SC=Xsux
SC=Cprt
SC=Cyrl
Script=Cyrl
SC=Dsrt
Script=Deva
SC=Deva
Script=Dsrt
Script=Egyp
SC=Egyp
Script=Ethi
SC=Ethi
Script=Geor
SC=Geor
Script=Glag
SC=Glag
Script=Goth
SC=Goth
SC=Grek
Script=Grek
SC=Gujr
Script=Gujr
SC=Guru
Script=Guru
SC=Han
Script=Hang
SC=Hang
Script=Hani
Script=Hano
SC=Hano
Script=Hebr
SC=Hebr
Script=Hira
SC=Hira
SC=Armi
SC=Zinh
SC=Phli
SC=Prti
Script=Ital
Script=Java
SC=Java
SC=Kthi
Script=Kali
Script=Kana
SC=Knda
SC=Kana
SC=Kali
Script=Khar
SC=Khar
SC=Khmr
Script=Khmr
Script=Knda
Script=Kthi
Script=Lana
SC=Lao
Script=Laoo
SC=Latn
Script=Latn
Script=Lepc
SC=Lepc
Script=Limb
SC=Limb
Script=Linb
SC=Linb
SC=Lisu
Script=Lyci
SC=Lyci
Script=Lydi
SC=Lydi
SC=Mlym
Script=Mand
SC=Mand
SC=Mtei
Script=Mlym
Script=Mong
SC=Mong
Script=Mtei
SC=Mymr
Script=Mymr
SC=Talu
SC=Nko
Script=Nkoo
Script=Ogam
SC=Ogam
SC=Olck
Script=Olck
SC=Ital
SC=Xpeo
SC=Sarb
SC=Orkh
SC=Orya
Script=Orkh
Script=Orya
Script=Osma
SC=Osma
Script=Phag
SC=Phag
Script=Phli
Script=Phnx
SC=Phnx
Script=Prti
Script=Qaac
Script=Qaai
SC=Rjng
Script=Rjng
SC=Runr
Script=Runr
SC=Samr
Script=Samr
Script=Sarb
Script=Saur
SC=Saur
Script=Shavian
SC=Shaw
Shaw
Script=Shaw
Script=Sinh
Script=Sinhala
SC=Sinh
Sinh
Script=Sund
Script=Sundanese
SC=Sund
Sund
Script=Sylo
Script=Syloti_Nagri
SC=Sylo
Sylo
Script=Syrc
Script=Syriac
SC=Syrc
Syrc
Script=Tagalog
SC=Tglg
Tglg
Script=Tagb
Script=Tagbanwa
SC=Tagb
Tagb
Script=Tai_Le
SC=Tale
Tale
SC=Lana
Script=Tai_Viet
SC=Tavt
Tavt
Script=Tale
Script=Talu
Script=Tamil
SC=Taml
Taml
Script=Taml
Script=Tavt
Script=Telu
Script=Telugu
SC=Telu
Telu
Script=Tfng
Script=Tifinagh
Script=Tglg
Script=Thaa
Script=Thaana
SC=Thaa
Thaa
Script=Thai
SC=Thai
Script=Tibetan
SC=Tibt
Tibt
Script=Tibt
SC=Tfng
Tfng
Script=Ugar
Script=Ugaritic
SC=Ugar
Ugar
Script=Unknown
SC=Zzzz
Zzzz
Script=Vai
SC=Vai
Script=Vaii
Script=Xpeo
Script=Xsux
Script=Yi
SC=Yi
Yi
Script=Yiii
Script=Zinh
Script=Zyyy
Script=Zzzz
SD
Soft_Dotted
Sentence_Break=AT
Sentence_Break=ATerm
SB=AT
Sentence_Break=CL
Sentence_Break=Close
SB=CL
Sentence_Break=CR
SB=CR
Sentence_Break=EX
Sentence_Break=Extend
SB=EX
Sentence_Break=FO
Sentence_Break=Format
SB=FO
Sentence_Break=LE
Sentence_Break=OLetter
Sentence_Break=LF
SB=LF
Sentence_Break=LO
Sentence_Break=Lower
SB=LO
Sentence_Break=NU
Sentence_Break=Numeric
SB=NU
SB=LE
Sentence_Break=Other
SB=XX
Sentence_Break=SC
Sentence_Break=SContinue
SB=SC
Sentence_Break=SE
Sentence_Break=Sep
SB=SE
Sentence_Break=Sp
SB=Sp
Sentence_Break=ST
Sentence_Break=STerm
SB=ST
Sentence_Break=UP
Sentence_Break=Upper
SB=UP
Sentence_Break=XX
Separator
Shavian
Small_Form_Variants
Space
Space_Separator
SpacePerl
XPerlSpace
Spacing_Modifier_Letters
Specials
STerm
Superscripts_And_Subscripts
Supplemental_Arrows_A
Supplemental_Arrows_B
Supplemental_Mathematical_Operators
Supplemental_Punctuation
Supplementary_Private_Use_Area_A
Supplementary_Private_Use_Area_B
Tags
Tai_Xuan_Jing_Symbols
Term
Terminal_Punctuation
Title
Titlecase
Transport_And_Map_Symbols
UIdeo
Unified_Ideograph
Unified_Canadian_Aboriginal_Syllabics_Extended
Block=Unified_Canadian_Aboriginal_Syllabics__Extended
Unknown
Upper
Uppercase
Vaii
Variation_Selector
VS
Variation_Selectors
Variation_Selectors_Supplement
Vedic_Extensions
Vertical_Forms
VertSpace
White_Space
WSpace
Word
Word_Break=ALetter
WB=LE
Word_Break=CR
WB=CR
Word_Break=EX
Word_Break=ExtendNumLet
Word_Break=Extend
WB=Extend
WB=EX
Word_Break=FO
Word_Break=Format
WB=FO
Word_Break=KA
Word_Break=Katakana
WB=KA
Word_Break=LE
Word_Break=LF
WB=LF
Word_Break=MB
Word_Break=MidNumLet
Word_Break=MidLetter
WB=ML
Word_Break=MidNum
WB=MN
WB=MB
Word_Break=ML
Word_Break=MN
Word_Break=Newline
WB=NL
Word_Break=NL
Word_Break=NU
Word_Break=Numeric
WB=NU
Word_Break=Other
WB=XX
Word_Break=XX
XID_Continue
XIDC
XID_Start
XIDS
X_POSIX_Alnum
X_POSIX_Alpha
X_POSIX_Blank
X_POSIX_Cntrl
X_POSIX_Digit
X_POSIX_Graph
X_POSIX_Lower
X_POSIX_Print
X_POSIX_Punct
X_POSIX_Space
X_POSIX_Upper
X_POSIX_Word
X_POSIX_XDigit
Yi_Radicals
Yi_Syllables
Yiii
Yijing_Hexagram_Symbols
_CanonDCIJ
_Case_Ignorable
_CombAbove
_X_Begin
_X_Extend
_X_LV_LVT_V
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment