Skip to content

Instantly share code, notes, and snippets.

@nanto
Created June 5, 2016 02:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nanto/502769aef22cc07a93349351ffd2550c to your computer and use it in GitHub Desktop.
Save nanto/502769aef22cc07a93349351ffd2550c to your computer and use it in GitHub Desktop.
Convert Perl regexp `\p{...}` to JS regexp
# Usage: $0 <property>
#
# Print a JavaScript regular expression pattern
# corresponding to a Perl regular expression pattern `\p{property}`.
#
# See `perldoc perluniprops` for values that can be specified as <property>.
#
# Author: nanto_vi
# License: Public Domain
use strict;
use warnings;
use utf8;
use feature qw(say);
use Unicode::UCD qw(prop_invlist);
my $property = shift;
die 'No property is specified' unless defined $property;
my $invlist = [prop_invlist($property)];
push @$invlist, 0x10FFFF + 1 if @$invlist % 2 == 1;
my $bmp_range_list = []; # bmp: Basic Multilingual Plane
my $sps_range_list = []; # sps: Supplementaly Planes
while (my ($a, $b) = splice @$invlist, 0, 2) {
# Ignore single surrogate
if ($a < 0xD800 && 0xD800 < $b) {
unshift @$invlist, $a, 0xD800, 0xD800, $b;
next;
}
if (0xD800 <= $a && $b <= 0xDFFF + 1) {
next;
}
if ($a < 0xDFFF + 1 && 0xDFFF + 1 < $b) {
unshift @$invlist, 0xDFFF + 1, $b;
next;
}
if ($a < 0x10000 && 0x10000 < $b) {
unshift @$invlist, $a, 0x10000, 0x10000, $b;
next;
}
if ($b <= 0x10000) {
push @$bmp_range_list, [$a, $b - 1];
} else {
push @$sps_range_list, [$a, $b - 1];
}
}
die 'Invalid property' if !@$bmp_range_list && !@$sps_range_list;
my $sps_naive_alternatives = [];
for (@$sps_range_list) {
my $a_hi = int(($_->[0] - 0x10000) / 0x400) + 0xD800;
my $a_lo = (($_->[0] - 0x10000) % 0x400) + 0xDC00;
my $b_hi = int(($_->[1] - 0x10000) / 0x400) + 0xD800;
my $b_lo = (($_->[1] - 0x10000) % 0x400) + 0xDC00;
if ($a_hi == $b_hi) {
push @$sps_naive_alternatives, { hi_range => [$a_hi, $a_hi], lo_range => [$a_lo, $b_lo] };
} else {
my $has_head = ($a_lo != 0xDC00);
my $has_tail = ($b_lo != 0xDFFF);
my $body_hi_start = $has_head ? $a_hi + 1 : $a_hi;
my $body_hi_end = $has_tail ? $b_hi - 1 : $b_hi;
my $has_body = ($body_hi_start <= $body_hi_end);
push @$sps_naive_alternatives, { hi_range => [$a_hi, $a_hi], lo_range => [$a_lo, 0xDFFF] } if $has_head;
push @$sps_naive_alternatives, { hi_range => [$body_hi_start, $body_hi_end], lo_range => [0xDC00, 0xDFFF] } if $has_body;
push @$sps_naive_alternatives, { hi_range => [$b_hi, $b_hi], lo_range => [0xDC00, $b_lo] } if $has_tail;
}
}
my $sps_alternatives = [];
my $last_hi = -1;
for (@$sps_naive_alternatives) {
my $hi = $_->{hi_range}->[1];
if ($hi == $last_hi) {
push @{ $sps_alternatives->[-1]->{lo_range_list} }, $_->{lo_range};
} else {
push @$sps_alternatives, {
hi_range_list => [$_->{hi_range}],
lo_range_list => [$_->{lo_range}],
};
}
$last_hi = $hi;
}
sub char_class_string {
my ($range_list) = @_;
if (@$range_list == 1 && $range_list->[0]->[0] == $range_list->[0]->[1]) {
return sprintf('\u%04X', $range_list->[0]->[0]);
}
my $range_strings = [ map {
my ($a, $b) = @$_;
($a == $b) ? sprintf('\u%04X', $a) : sprintf('\u%04X-\u%04X', $a, $b);
} @$range_list ];
return '[' . join('', @$range_strings) . ']';
}
my $alternative_strings = [
@$bmp_range_list ? char_class_string($bmp_range_list) : (),
map {
char_class_string($_->{hi_range_list}) .
char_class_string($_->{lo_range_list})
} @$sps_alternatives,
];
my $pattern = @$sps_alternatives
? '(?:' . join('|', @$alternative_strings) . ')'
: $alternative_strings->[0];
say Unicode::UCD::UnicodeVersion;
say $pattern;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment