Skip to content

Instantly share code, notes, and snippets.

@vsemozhetbyt
Created March 5, 2018 02:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsemozhetbyt/893044264a088b96025b2d51ed17aec3 to your computer and use it in GitHub Desktop.
Save vsemozhetbyt/893044264a088b96025b2d51ed17aec3 to your computer and use it in GitHub Desktop.
re-unicode-properties.js
'use strict';
/**
 * Value names used with the `gc=` (General_Category) property escape.
 * Grouped by major class; the first name in each group is the class itself.
 */
const generalCategoryValues = [
  // Letters
  'Letter',
  'Cased_Letter',
  'Uppercase_Letter',
  'Lowercase_Letter',
  'Titlecase_Letter',
  'Modifier_Letter',
  'Other_Letter',

  // Marks
  'Mark',
  'Nonspacing_Mark',
  'Spacing_Mark',
  'Enclosing_Mark',

  // Numbers
  'Number',
  'Decimal_Number',
  'Letter_Number',
  'Other_Number',

  // Symbols
  'Symbol',
  'Math_Symbol',
  'Currency_Symbol',
  'Modifier_Symbol',
  'Other_Symbol',

  // Punctuation
  'Punctuation',
  'Connector_Punctuation',
  'Dash_Punctuation',
  'Open_Punctuation',
  'Close_Punctuation',
  'Initial_Punctuation',
  'Final_Punctuation',
  'Other_Punctuation',

  // Separators
  'Separator',
  'Space_Separator',
  'Line_Separator',
  'Paragraph_Separator',

  // Everything else
  'Other',
  'Control',
  'Format',
  'Surrogate',
  'Private_Use',
  'Unassigned',
];

/**
 * Script names used with both the `sc=` (Script) and `scx=`
 * (Script_Extensions) property escapes.
 */
const scriptValues = [
  'Adlam',
  'Ahom',
  'Anatolian_Hieroglyphs',
  'Arabic',
  'Armenian',
  'Avestan',
  'Balinese',
  'Bamum',
  'Bassa_Vah',
  'Batak',
  'Bengali',
  'Bhaiksuki',
  'Bopomofo',
  'Brahmi',
  'Braille',
  'Buginese',
  'Buhid',
  'Canadian_Aboriginal',
  'Carian',
  'Caucasian_Albanian',
  'Chakma',
  'Cham',
  'Cherokee',
  'Common',
  'Coptic',
  'Cuneiform',
  'Cypriot',
  'Cyrillic',
  'Deseret',
  'Devanagari',
  'Duployan',
  'Egyptian_Hieroglyphs',
  'Elbasan',
  'Ethiopic',
  'Georgian',
  'Glagolitic',
  'Gothic',
  'Grantha',
  'Greek',
  'Gujarati',
  'Gurmukhi',
  'Han',
  'Hangul',
  'Hanunoo',
  'Hatran',
  'Hebrew',
  'Hiragana',
  'Imperial_Aramaic',
  'Inherited',
  'Inscriptional_Pahlavi',
  'Inscriptional_Parthian',
  'Javanese',
  'Kaithi',
  'Kannada',
  'Katakana',
  'Kayah_Li',
  'Kharoshthi',
  'Khmer',
  'Khojki',
  'Khudawadi',
  'Lao',
  'Latin',
  'Lepcha',
  'Limbu',
  'Linear_A',
  'Linear_B',
  'Lisu',
  'Lycian',
  'Lydian',
  'Mahajani',
  'Malayalam',
  'Mandaic',
  'Manichaean',
  'Marchen',
  'Masaram_Gondi',
  'Meetei_Mayek',
  'Mende_Kikakui',
  'Meroitic_Cursive',
  'Meroitic_Hieroglyphs',
  'Miao',
  'Modi',
  'Mongolian',
  'Mro',
  'Multani',
  'Myanmar',
  'Nabataean',
  'New_Tai_Lue',
  'Newa',
  'Nko',
  'Nushu',
  'Ogham',
  'Ol_Chiki',
  'Old_Hungarian',
  'Old_Italic',
  'Old_North_Arabian',
  'Old_Permic',
  'Old_Persian',
  'Old_South_Arabian',
  'Old_Turkic',
  'Oriya',
  'Osage',
  'Osmanya',
  'Pahawh_Hmong',
  'Palmyrene',
  'Pau_Cin_Hau',
  'Phags_Pa',
  'Phoenician',
  'Psalter_Pahlavi',
  'Rejang',
  'Runic',
  'Samaritan',
  'Saurashtra',
  'Sharada',
  'Shavian',
  'Siddham',
  'SignWriting',
  'Sinhala',
  'Sora_Sompeng',
  'Soyombo',
  'Sundanese',
  'Syloti_Nagri',
  'Syriac',
  'Tagalog',
  'Tagbanwa',
  'Tai_Le',
  'Tai_Tham',
  'Tai_Viet',
  'Takri',
  'Tamil',
  'Tangut',
  'Telugu',
  'Thaana',
  'Thai',
  'Tibetan',
  'Tifinagh',
  'Tirhuta',
  'Ugaritic',
  'Vai',
  'Warang_Citi',
  'Yi',
  'Zanabazar_Square',
];

/**
 * Names of binary properties, used bare inside `\p{...}` (no `name=value`).
 */
const binaryPropertyNames = [
  'ASCII',
  'ASCII_Hex_Digit',
  'Alphabetic',
  'Any',
  'Assigned',
  'Bidi_Control',
  'Bidi_Mirrored',
  'Case_Ignorable',
  'Cased',
  'Changes_When_Casefolded',
  'Changes_When_Casemapped',
  'Changes_When_Lowercased',
  'Changes_When_NFKC_Casefolded',
  'Changes_When_Titlecased',
  'Changes_When_Uppercased',
  'Dash',
  'Default_Ignorable_Code_Point',
  'Deprecated',
  'Diacritic',
  'Emoji',
  'Emoji_Component',
  'Emoji_Modifier',
  'Emoji_Modifier_Base',
  'Emoji_Presentation',
  'Extender',
  'Grapheme_Base',
  'Grapheme_Extend',
  'Hex_Digit',
  'IDS_Binary_Operator',
  'IDS_Trinary_Operator',
  'ID_Continue',
  'ID_Start',
  'Ideographic',
  'Join_Control',
  'Logical_Order_Exception',
  'Lowercase',
  'Math',
  'Noncharacter_Code_Point',
  'Pattern_Syntax',
  'Pattern_White_Space',
  'Quotation_Mark',
  'Radical',
  'Regional_Indicator',
  'Sentence_Terminal',
  'Soft_Dotted',
  'Terminal_Punctuation',
  'Unified_Ideograph',
  'Uppercase',
  'Variation_Selector',
  'White_Space',
  'XID_Continue',
  'XID_Start',
];

/**
 * Catalogue of Unicode property names and values from which the exported
 * `\p{...}` regular expressions are built.
 *
 * - `nonBinaryNames` maps a property name to the list of values it accepts.
 * - `binaryNames` lists properties that take no value.
 */
const unicodeProperties = {
  nonBinaryNames: {
    General_Category: generalCategoryValues,
    Script: scriptValues,

    // Script_Extensions takes the same value names as Script, so it is a
    // live alias: the getter hands back the very same array, not a copy.
    get Script_Extensions() {
      return this.Script;
    },
  },
  binaryNames: binaryPropertyNames,
};
module.exports = [
...unicodeProperties.nonBinaryNames.General_Category
.map(value => RegExp(`\\p{gc=${value}}`, 'u')),
...unicodeProperties.nonBinaryNames.Script
.map(value => RegExp(`\\p{sc=${value}}`, 'u')),
...unicodeProperties.nonBinaryNames.Script_Extensions
.map(value => RegExp(`\\p{scx=${value}}`, 'u')),
...unicodeProperties.binaryNames
.map(binaryName => RegExp(`\\p{${binaryName}}`, 'u')),
];
@vsemozhetbyt
Copy link
Author

vsemozhetbyt commented Mar 5, 2018

'use strict';

const { writeFileSync } = require('fs');
const reUnicodeProperties = require('./re-unicode-properties.js');


// Dump the source text of every exported pattern as a pretty-printed JSON
// array (2-space indent) followed by a trailing newline.
const patternSources = reUnicodeProperties.map(({ source }) => source);
const serialized = JSON.stringify(patternSources, null, 2);

writeFileSync('re-unicode-properties.serialize.json', `${serialized}\n`);
[
  "\\p{gc=Letter}",
  "\\p{gc=Cased_Letter}",
  "\\p{gc=Uppercase_Letter}",
  "\\p{gc=Lowercase_Letter}",
  "\\p{gc=Titlecase_Letter}",
  "\\p{gc=Modifier_Letter}",
  "\\p{gc=Other_Letter}",
  "\\p{gc=Mark}",
  "\\p{gc=Nonspacing_Mark}",
  "\\p{gc=Spacing_Mark}",
  "\\p{gc=Enclosing_Mark}",
  "\\p{gc=Number}",
  "\\p{gc=Decimal_Number}",
  "\\p{gc=Letter_Number}",
  "\\p{gc=Other_Number}",
  "\\p{gc=Symbol}",
  "\\p{gc=Math_Symbol}",
  "\\p{gc=Currency_Symbol}",
  "\\p{gc=Modifier_Symbol}",
  "\\p{gc=Other_Symbol}",
  "\\p{gc=Punctuation}",
  "\\p{gc=Connector_Punctuation}",
  "\\p{gc=Dash_Punctuation}",
  "\\p{gc=Open_Punctuation}",
  "\\p{gc=Close_Punctuation}",
  "\\p{gc=Initial_Punctuation}",
  "\\p{gc=Final_Punctuation}",
  "\\p{gc=Other_Punctuation}",
  "\\p{gc=Separator}",
  "\\p{gc=Space_Separator}",
  "\\p{gc=Line_Separator}",
  "\\p{gc=Paragraph_Separator}",
  "\\p{gc=Other}",
  "\\p{gc=Control}",
  "\\p{gc=Format}",
  "\\p{gc=Surrogate}",
  "\\p{gc=Private_Use}",
  "\\p{gc=Unassigned}",
  "\\p{sc=Adlam}",
  "\\p{sc=Ahom}",
  "\\p{sc=Anatolian_Hieroglyphs}",
  "\\p{sc=Arabic}",
  "\\p{sc=Armenian}",
  "\\p{sc=Avestan}",
  "\\p{sc=Balinese}",
  "\\p{sc=Bamum}",
  "\\p{sc=Bassa_Vah}",
  "\\p{sc=Batak}",
  "\\p{sc=Bengali}",
  "\\p{sc=Bhaiksuki}",
  "\\p{sc=Bopomofo}",
  "\\p{sc=Brahmi}",
  "\\p{sc=Braille}",
  "\\p{sc=Buginese}",
  "\\p{sc=Buhid}",
  "\\p{sc=Canadian_Aboriginal}",
  "\\p{sc=Carian}",
  "\\p{sc=Caucasian_Albanian}",
  "\\p{sc=Chakma}",
  "\\p{sc=Cham}",
  "\\p{sc=Cherokee}",
  "\\p{sc=Common}",
  "\\p{sc=Coptic}",
  "\\p{sc=Cuneiform}",
  "\\p{sc=Cypriot}",
  "\\p{sc=Cyrillic}",
  "\\p{sc=Deseret}",
  "\\p{sc=Devanagari}",
  "\\p{sc=Duployan}",
  "\\p{sc=Egyptian_Hieroglyphs}",
  "\\p{sc=Elbasan}",
  "\\p{sc=Ethiopic}",
  "\\p{sc=Georgian}",
  "\\p{sc=Glagolitic}",
  "\\p{sc=Gothic}",
  "\\p{sc=Grantha}",
  "\\p{sc=Greek}",
  "\\p{sc=Gujarati}",
  "\\p{sc=Gurmukhi}",
  "\\p{sc=Han}",
  "\\p{sc=Hangul}",
  "\\p{sc=Hanunoo}",
  "\\p{sc=Hatran}",
  "\\p{sc=Hebrew}",
  "\\p{sc=Hiragana}",
  "\\p{sc=Imperial_Aramaic}",
  "\\p{sc=Inherited}",
  "\\p{sc=Inscriptional_Pahlavi}",
  "\\p{sc=Inscriptional_Parthian}",
  "\\p{sc=Javanese}",
  "\\p{sc=Kaithi}",
  "\\p{sc=Kannada}",
  "\\p{sc=Katakana}",
  "\\p{sc=Kayah_Li}",
  "\\p{sc=Kharoshthi}",
  "\\p{sc=Khmer}",
  "\\p{sc=Khojki}",
  "\\p{sc=Khudawadi}",
  "\\p{sc=Lao}",
  "\\p{sc=Latin}",
  "\\p{sc=Lepcha}",
  "\\p{sc=Limbu}",
  "\\p{sc=Linear_A}",
  "\\p{sc=Linear_B}",
  "\\p{sc=Lisu}",
  "\\p{sc=Lycian}",
  "\\p{sc=Lydian}",
  "\\p{sc=Mahajani}",
  "\\p{sc=Malayalam}",
  "\\p{sc=Mandaic}",
  "\\p{sc=Manichaean}",
  "\\p{sc=Marchen}",
  "\\p{sc=Masaram_Gondi}",
  "\\p{sc=Meetei_Mayek}",
  "\\p{sc=Mende_Kikakui}",
  "\\p{sc=Meroitic_Cursive}",
  "\\p{sc=Meroitic_Hieroglyphs}",
  "\\p{sc=Miao}",
  "\\p{sc=Modi}",
  "\\p{sc=Mongolian}",
  "\\p{sc=Mro}",
  "\\p{sc=Multani}",
  "\\p{sc=Myanmar}",
  "\\p{sc=Nabataean}",
  "\\p{sc=New_Tai_Lue}",
  "\\p{sc=Newa}",
  "\\p{sc=Nko}",
  "\\p{sc=Nushu}",
  "\\p{sc=Ogham}",
  "\\p{sc=Ol_Chiki}",
  "\\p{sc=Old_Hungarian}",
  "\\p{sc=Old_Italic}",
  "\\p{sc=Old_North_Arabian}",
  "\\p{sc=Old_Permic}",
  "\\p{sc=Old_Persian}",
  "\\p{sc=Old_South_Arabian}",
  "\\p{sc=Old_Turkic}",
  "\\p{sc=Oriya}",
  "\\p{sc=Osage}",
  "\\p{sc=Osmanya}",
  "\\p{sc=Pahawh_Hmong}",
  "\\p{sc=Palmyrene}",
  "\\p{sc=Pau_Cin_Hau}",
  "\\p{sc=Phags_Pa}",
  "\\p{sc=Phoenician}",
  "\\p{sc=Psalter_Pahlavi}",
  "\\p{sc=Rejang}",
  "\\p{sc=Runic}",
  "\\p{sc=Samaritan}",
  "\\p{sc=Saurashtra}",
  "\\p{sc=Sharada}",
  "\\p{sc=Shavian}",
  "\\p{sc=Siddham}",
  "\\p{sc=SignWriting}",
  "\\p{sc=Sinhala}",
  "\\p{sc=Sora_Sompeng}",
  "\\p{sc=Soyombo}",
  "\\p{sc=Sundanese}",
  "\\p{sc=Syloti_Nagri}",
  "\\p{sc=Syriac}",
  "\\p{sc=Tagalog}",
  "\\p{sc=Tagbanwa}",
  "\\p{sc=Tai_Le}",
  "\\p{sc=Tai_Tham}",
  "\\p{sc=Tai_Viet}",
  "\\p{sc=Takri}",
  "\\p{sc=Tamil}",
  "\\p{sc=Tangut}",
  "\\p{sc=Telugu}",
  "\\p{sc=Thaana}",
  "\\p{sc=Thai}",
  "\\p{sc=Tibetan}",
  "\\p{sc=Tifinagh}",
  "\\p{sc=Tirhuta}",
  "\\p{sc=Ugaritic}",
  "\\p{sc=Vai}",
  "\\p{sc=Warang_Citi}",
  "\\p{sc=Yi}",
  "\\p{sc=Zanabazar_Square}",
  "\\p{scx=Adlam}",
  "\\p{scx=Ahom}",
  "\\p{scx=Anatolian_Hieroglyphs}",
  "\\p{scx=Arabic}",
  "\\p{scx=Armenian}",
  "\\p{scx=Avestan}",
  "\\p{scx=Balinese}",
  "\\p{scx=Bamum}",
  "\\p{scx=Bassa_Vah}",
  "\\p{scx=Batak}",
  "\\p{scx=Bengali}",
  "\\p{scx=Bhaiksuki}",
  "\\p{scx=Bopomofo}",
  "\\p{scx=Brahmi}",
  "\\p{scx=Braille}",
  "\\p{scx=Buginese}",
  "\\p{scx=Buhid}",
  "\\p{scx=Canadian_Aboriginal}",
  "\\p{scx=Carian}",
  "\\p{scx=Caucasian_Albanian}",
  "\\p{scx=Chakma}",
  "\\p{scx=Cham}",
  "\\p{scx=Cherokee}",
  "\\p{scx=Common}",
  "\\p{scx=Coptic}",
  "\\p{scx=Cuneiform}",
  "\\p{scx=Cypriot}",
  "\\p{scx=Cyrillic}",
  "\\p{scx=Deseret}",
  "\\p{scx=Devanagari}",
  "\\p{scx=Duployan}",
  "\\p{scx=Egyptian_Hieroglyphs}",
  "\\p{scx=Elbasan}",
  "\\p{scx=Ethiopic}",
  "\\p{scx=Georgian}",
  "\\p{scx=Glagolitic}",
  "\\p{scx=Gothic}",
  "\\p{scx=Grantha}",
  "\\p{scx=Greek}",
  "\\p{scx=Gujarati}",
  "\\p{scx=Gurmukhi}",
  "\\p{scx=Han}",
  "\\p{scx=Hangul}",
  "\\p{scx=Hanunoo}",
  "\\p{scx=Hatran}",
  "\\p{scx=Hebrew}",
  "\\p{scx=Hiragana}",
  "\\p{scx=Imperial_Aramaic}",
  "\\p{scx=Inherited}",
  "\\p{scx=Inscriptional_Pahlavi}",
  "\\p{scx=Inscriptional_Parthian}",
  "\\p{scx=Javanese}",
  "\\p{scx=Kaithi}",
  "\\p{scx=Kannada}",
  "\\p{scx=Katakana}",
  "\\p{scx=Kayah_Li}",
  "\\p{scx=Kharoshthi}",
  "\\p{scx=Khmer}",
  "\\p{scx=Khojki}",
  "\\p{scx=Khudawadi}",
  "\\p{scx=Lao}",
  "\\p{scx=Latin}",
  "\\p{scx=Lepcha}",
  "\\p{scx=Limbu}",
  "\\p{scx=Linear_A}",
  "\\p{scx=Linear_B}",
  "\\p{scx=Lisu}",
  "\\p{scx=Lycian}",
  "\\p{scx=Lydian}",
  "\\p{scx=Mahajani}",
  "\\p{scx=Malayalam}",
  "\\p{scx=Mandaic}",
  "\\p{scx=Manichaean}",
  "\\p{scx=Marchen}",
  "\\p{scx=Masaram_Gondi}",
  "\\p{scx=Meetei_Mayek}",
  "\\p{scx=Mende_Kikakui}",
  "\\p{scx=Meroitic_Cursive}",
  "\\p{scx=Meroitic_Hieroglyphs}",
  "\\p{scx=Miao}",
  "\\p{scx=Modi}",
  "\\p{scx=Mongolian}",
  "\\p{scx=Mro}",
  "\\p{scx=Multani}",
  "\\p{scx=Myanmar}",
  "\\p{scx=Nabataean}",
  "\\p{scx=New_Tai_Lue}",
  "\\p{scx=Newa}",
  "\\p{scx=Nko}",
  "\\p{scx=Nushu}",
  "\\p{scx=Ogham}",
  "\\p{scx=Ol_Chiki}",
  "\\p{scx=Old_Hungarian}",
  "\\p{scx=Old_Italic}",
  "\\p{scx=Old_North_Arabian}",
  "\\p{scx=Old_Permic}",
  "\\p{scx=Old_Persian}",
  "\\p{scx=Old_South_Arabian}",
  "\\p{scx=Old_Turkic}",
  "\\p{scx=Oriya}",
  "\\p{scx=Osage}",
  "\\p{scx=Osmanya}",
  "\\p{scx=Pahawh_Hmong}",
  "\\p{scx=Palmyrene}",
  "\\p{scx=Pau_Cin_Hau}",
  "\\p{scx=Phags_Pa}",
  "\\p{scx=Phoenician}",
  "\\p{scx=Psalter_Pahlavi}",
  "\\p{scx=Rejang}",
  "\\p{scx=Runic}",
  "\\p{scx=Samaritan}",
  "\\p{scx=Saurashtra}",
  "\\p{scx=Sharada}",
  "\\p{scx=Shavian}",
  "\\p{scx=Siddham}",
  "\\p{scx=SignWriting}",
  "\\p{scx=Sinhala}",
  "\\p{scx=Sora_Sompeng}",
  "\\p{scx=Soyombo}",
  "\\p{scx=Sundanese}",
  "\\p{scx=Syloti_Nagri}",
  "\\p{scx=Syriac}",
  "\\p{scx=Tagalog}",
  "\\p{scx=Tagbanwa}",
  "\\p{scx=Tai_Le}",
  "\\p{scx=Tai_Tham}",
  "\\p{scx=Tai_Viet}",
  "\\p{scx=Takri}",
  "\\p{scx=Tamil}",
  "\\p{scx=Tangut}",
  "\\p{scx=Telugu}",
  "\\p{scx=Thaana}",
  "\\p{scx=Thai}",
  "\\p{scx=Tibetan}",
  "\\p{scx=Tifinagh}",
  "\\p{scx=Tirhuta}",
  "\\p{scx=Ugaritic}",
  "\\p{scx=Vai}",
  "\\p{scx=Warang_Citi}",
  "\\p{scx=Yi}",
  "\\p{scx=Zanabazar_Square}",
  "\\p{ASCII}",
  "\\p{ASCII_Hex_Digit}",
  "\\p{Alphabetic}",
  "\\p{Any}",
  "\\p{Assigned}",
  "\\p{Bidi_Control}",
  "\\p{Bidi_Mirrored}",
  "\\p{Case_Ignorable}",
  "\\p{Cased}",
  "\\p{Changes_When_Casefolded}",
  "\\p{Changes_When_Casemapped}",
  "\\p{Changes_When_Lowercased}",
  "\\p{Changes_When_NFKC_Casefolded}",
  "\\p{Changes_When_Titlecased}",
  "\\p{Changes_When_Uppercased}",
  "\\p{Dash}",
  "\\p{Default_Ignorable_Code_Point}",
  "\\p{Deprecated}",
  "\\p{Diacritic}",
  "\\p{Emoji}",
  "\\p{Emoji_Component}",
  "\\p{Emoji_Modifier}",
  "\\p{Emoji_Modifier_Base}",
  "\\p{Emoji_Presentation}",
  "\\p{Extender}",
  "\\p{Grapheme_Base}",
  "\\p{Grapheme_Extend}",
  "\\p{Hex_Digit}",
  "\\p{IDS_Binary_Operator}",
  "\\p{IDS_Trinary_Operator}",
  "\\p{ID_Continue}",
  "\\p{ID_Start}",
  "\\p{Ideographic}",
  "\\p{Join_Control}",
  "\\p{Logical_Order_Exception}",
  "\\p{Lowercase}",
  "\\p{Math}",
  "\\p{Noncharacter_Code_Point}",
  "\\p{Pattern_Syntax}",
  "\\p{Pattern_White_Space}",
  "\\p{Quotation_Mark}",
  "\\p{Radical}",
  "\\p{Regional_Indicator}",
  "\\p{Sentence_Terminal}",
  "\\p{Soft_Dotted}",
  "\\p{Terminal_Punctuation}",
  "\\p{Unified_Ideograph}",
  "\\p{Uppercase}",
  "\\p{Variation_Selector}",
  "\\p{White_Space}",
  "\\p{XID_Continue}",
  "\\p{XID_Start}"
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment