Skip to content

Instantly share code, notes, and snippets.

@Narsil
Last active July 9, 2020 08:32
Show Gist options
  • Save Narsil/1df9fbbf5296a8d4d62de55dcb2fe700 to your computer and use it in GitHub Desktop.
Save Narsil/1df9fbbf5296a8d4d62de55dcb2fe700 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Generate the Rust Unicode script table (unicode.rs) from Unicode Scripts.txt
#
# usage: perl unicode_script.pl < scripts.txt > src/models/unigram/unicode.rs
#
# Provenance banner written at the top of the generated Rust file.
print
    "// Generated by modified Perl script at https://github.com/google/sentencepiece/blob/master/data/gen_unicode_scripts_code.pl\n",
    "// Unicode scripts : https://gist.github.com/Narsil/07556f26dc84a6baeff4d499e68d3cd2\n",
    "// Rust adaptation : https://gist.github.com/Narsil/1df9fbbf5296a8d4d62de55dcb2fe700\n\n";

# Variants of the generated `Script` enum, in emission order.
# `Any` is the value used by the catch-all arm of the generated match; the
# other names are written without underscores, which is the form the input
# loop produces after stripping `_` from each script name it reads.
# The list is fixed: a script name in the input that is missing here would
# yield a match arm referring to a variant the enum does not declare.
my @script_variants = qw(
    Any
    Adlam Ahom AnatolianHieroglyphs Arabic Armenian Avestan
    Balinese Bamum BassaVah Batak Bengali Bhaiksuki Bopomofo Brahmi Braille
    Buginese Buhid
    CanadianAboriginal Carian CaucasianAlbanian Chakma Cham Cherokee Common
    Coptic Cuneiform Cypriot Cyrillic
    Deseret Devanagari Duployan
    EgyptianHieroglyphs Elbasan Ethiopic
    Georgian Glagolitic Gothic Grantha Greek Gujarati Gurmukhi
    Han Hangul Hanunoo Hatran Hebrew Hiragana
    ImperialAramaic Inherited InscriptionalPahlavi InscriptionalParthian
    Javanese
    Kaithi Kannada Katakana KayahLi Kharoshthi Khmer Khojki Khudawadi
    Lao Latin Lepcha Limbu LinearA LinearB Lisu Lycian Lydian
    Mahajani Malayalam Mandaic Manichaean Marchen MeeteiMayek MendeKikakui
    MeroiticCursive MeroiticHieroglyphs Miao Modi Mongolian Mro Multani Myanmar
    Nabataean NewTaiLue Newa Nko
    Ogham OlChiki OldHungarian OldItalic OldNorthArabian OldPermic OldPersian
    OldSouthArabian OldTurkic Oriya Osage Osmanya
    PahawhHmong Palmyrene PauCinHau PhagsPa Phoenician PsalterPahlavi
    Rejang Runic
    Samaritan Saurashtra Sharada Shavian Siddham SignWriting Sinhala
    SoraSompeng Sundanese SylotiNagri Syriac
    Tagalog Tagbanwa TaiLe TaiTham TaiViet Takri Tamil Tangut Telugu Thaana
    Thai Tibetan Tifinagh Tirhuta
    Ugaritic
    Vai
    WarangCiti
    Yi
);

# Emit the enum declaration: derive line, opening brace, one line per
# variant, closing brace followed by a blank line.
print "#[derive(PartialEq, Debug, Clone, Copy)]\n";
print "pub enum Script {\n";
print " $_,\n" for @script_variants;
print "}\n\n";
# Emit the Rust `get_script` function: a single `match` on the code point
# with one arm per data line read from the input (stdin, or files named on
# the command line).
print "pub fn get_script(c: &char) -> Script {\n",
      " match *c as u32 {\n";

while (defined(my $line = <>)) {
    chomp $line;

    # A data line is "XXXX ; Script_Name # ..." for a single code point or
    # "XXXX..YYYY ; Script_Name # ..." for an inclusive range. Anything
    # else (comments, blank lines) produces no output.
    next unless $line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\S+)\s+\#/;
    my ($first, $last, $variant) = ($1, $2, $3);

    # Enum variants are written without underscores (Old_Italic -> OldItalic).
    $variant =~ tr/_//d;

    if (defined $last) {
        printf(" 0x%s..=0x%s => Script::%s,\n", $first, $last, $variant);
    } else {
        printf(" 0x%s => Script::%s,\n", $first, $variant);
    }
}

# Code points not covered by any arm map to `Script::Any`; then close the
# match and the function.
print " _ => Script::Any,\n",
      " }\n",
      "}\n";
# Append a fixed `#[cfg(test)]` module to the generated Rust file, checking
# `get_script` against a handful of characters.
# The heredoc is non-interpolating, so `$` and `@` are written literally
# and need no escaping.
print <<'END_OF_TESTS';

#[cfg(test)]
mod tests {
 use super::*;

 #[test]
 fn test_unicode_script() {
 assert_eq!(Script::Han, get_script(&'京'));
 assert_eq!(Script::Han, get_script(&'太'));
 assert_eq!(Script::Hiragana, get_script(&'い'));
 assert_eq!(Script::Katakana, get_script(&'グ'));
 assert_eq!(Script::Common, get_script(&'ー'));
 assert_eq!(Script::Latin, get_script(&'a'));
 assert_eq!(Script::Latin, get_script(&'A'));
 assert_eq!(Script::Common, get_script(&'0'));
 assert_eq!(Script::Common, get_script(&'$'));
 assert_eq!(Script::Common, get_script(&'@'));
 assert_eq!(Script::Common, get_script(&'-'));
 }
}
END_OF_TESTS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment