Skip to content

Instantly share code, notes, and snippets.

@japhb
Last active January 29, 2024 05:45
Show Gist options
  • Save japhb/7b8a68da1b4b6c201e5334c1269b951a to your computer and use it in GitHub Desktop.
Save japhb/7b8a68da1b4b6c201e5334c1269b951a to your computer and use it in GitHub Desktop.
japhb's Unicode search/display script 'u'
#!/usr/bin/env raku
use sigpipe;
use Text::MiscUtils::Layout;
sub show-cp($cp) {
my $char = $cp.chr;
my $width = duospace-width($char);
my $spacer = ' ' x (2 - $width);
my $name = $cp.uniname;
my $age = $char.uniprop('Age');
$age = -1 if $age eq 'Unassigned';
my $script = $char.uniprop('Script');
my @props = $char.uniprops.unique;
my $skip = $script eq 'Unknown'
|| 'Cc' ∈ @props || 'Cn' ∈ @props || 'Cs' ∈ @props;
my $glyph = $skip ?? '' !! $char;
# U+202D = LEFT-TO-RIGHT OVERRIDE (prevents column flipping for RTL characters)
my $bidi = $char.uniprop('Bidi_Class');
my $ltro = $*IGNORE-RTL || $bidi eq 'L' ?? '' !! "\x[202D]";
try printf "%5X $ltro%s$ltro%s %d %4.1f %-5s %-11s %s\n",
$cp, $glyph, $spacer, $width, $age, ~@props, $script, $name;
say "Unable to show codepoint U+{$cp.base(16)}:\n$!" if $!;
}
#| Describe a single unicode character
multi sub MAIN($character where *.chars == 1, Bool :ignore-rtl($*IGNORE-RTL)) {
show-cp($_) for $character.ords;
}
#| Describe a unicode codepoint in +HEX or U+HEX format
multi sub MAIN($codepoint where /^ [u|U]? '+' <:hex>+ $/, Bool :ignore-rtl($*IGNORE-RTL)) {
my $cp = $codepoint.subst(/[u|U]? '+'/, '0x').Numeric;
show-cp($cp);
}
#| Decode a string into individual codepoints
multi sub MAIN('decode', Str:D $string, Bool :ignore-rtl($*IGNORE-RTL)) {
show-cp($_) for $string.ords;
}
#| Search charnames
multi sub MAIN(
*@substrings, #= Primary search terms (auto-uppercased as substrings)
Str :$regex, #= Refinement regex (case insensitive)
Str :$prop, #= Required property
Str :$script, #= Required script
UInt :$first = 0, #= First codepoint to search
UInt :$last = 0x1FFFF, #= Last codepoint to search
Real :$age = Real, #= Exact Unicode version
Real :$oldest = $age || 0, #= Oldest Unicode version
Real :$newest = $age || +~Unicode.version, #= Newest Unicode version
Bool :ignore-rtl($*IGNORE-RTL), #= Ignore RTL (don't force LTR formatting)
) {
# See https://irclog.perlgeek.de/perl6/2016-12-12
# for various conversations on this technique
my $sieve = $first..$last;
for @substrings -> $ss {
my $needle = $ss.uc;
$sieve .= grep: { uniname($_).contains($needle) }
}
$sieve .= grep({ $prop ∈ $_.chr.uniprops }) if $prop;
$sieve .= grep({ $script.fc eq $_.chr.uniprop('Script').fc }) if $script;
$sieve .= grep({ uniname($_).match(rx:i/<$regex>/) }) if $regex;
$sieve .= grep({ my $age = $_.chr.uniprop('Age');
$age eq 'Unassigned' ?? !$oldest
!! $oldest <= $age.Rat <= $newest });
show-cp($_) for @$sieve;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment