Skip to content

Instantly share code, notes, and snippets.

@miken32
Last active May 10, 2017 23:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miken32/5e6785b2d2bd0bf13bbb2d910df373a1 to your computer and use it in GitHub Desktop.
Save miken32/5e6785b2d2bd0bf13bbb2d910df373a1 to your computer and use it in GitHub Desktop.
Verify if all characters of a string belong to a given Unicode character block
<?php
/**
* BlockCheck 1.0
*
* Copyright (C) 2017 Michael Newton
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @package BlockCheck
* @author Michael Newton <miken32>
* @copyright 2017 Michael Newton
* @license https://opensource.org/licenses/GPL-3.0 GPLv3
* @version 1.0
* @see http://stackoverflow.com/a/43010755/1255289
*/
namespace Miken32;
/**
 * BlockCheck
 *
 * A utility class to check whether or not strings are part of
 * a Unicode character block.
 *
 * Each block constant is a two-element array holding the first and last
 * code point of the block (inclusive). Strings passed in are expected to be
 * UTF-8 encoded; a string that is not valid UTF-8 never passes a check.
 *
 * Can be used statically or not. Sample usage:
 * ```
 * <?php
 * use Miken32\BlockCheck as BC;
 * if (BC::isValid("🜈🝮🝤", BC::ALCHEMICAL_SYMBOLS)) {
 *     echo "Valid characters\n";
 * } else {
 *     echo "Invalid characters\n";
 * }
 *
 * $myChecker = new BC("BASIC_LATIN");
 * $test = "Meet me at the café.";
 * if ($myChecker->check($test)) {
 *     echo "Valid characters\n";
 * } else {
 *     echo "Invalid characters\n";
 * }
 * $myChecker->addBlock(BC::LATIN_1_SUPPLEMENT);
 * if ($myChecker->check($test)) {
 *     echo "Valid characters\n";
 * } else {
 *     echo "Invalid characters\n";
 * }
 * ```
 */
class BlockCheck
{
    const ADLAM = [0x1E900, 0x1E95F];
    const AEGEAN_NUMBERS = [0x10100, 0x1013F];
    const AHOM = [0x11700, 0x1173F];
    const ALCHEMICAL_SYMBOLS = [0x1F700, 0x1F77F];
    const ALPHABETIC_PRESENTATION_FORMS = [0xFB00, 0xFB4F];
    const ANATOLIAN_HIEROGLYPHS = [0x14400, 0x1467F];
    const ANCIENT_GREEK_MUSICAL_NOTATION = [0x1D200, 0x1D24F];
    const ANCIENT_GREEK_NUMBERS = [0x10140, 0x1018F];
    const ANCIENT_SYMBOLS = [0x10190, 0x101CF];
    const ARABIC = [0x0600, 0x06FF];
    const ARABIC_EXTENDED_A = [0x08A0, 0x08FF];
    const ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS = [0x1EE00, 0x1EEFF];
    const ARABIC_PRESENTATION_FORMS_A = [0xFB50, 0xFDFF];
    const ARABIC_PRESENTATION_FORMS_B = [0xFE70, 0xFEFF];
    const ARABIC_SUPPLEMENT = [0x0750, 0x077F];
    const ARMENIAN = [0x0530, 0x058F];
    const ARROWS = [0x2190, 0x21FF];
    const AVESTAN = [0x10B00, 0x10B3F];
    const BALINESE = [0x1B00, 0x1B7F];
    const BAMUM = [0xA6A0, 0xA6FF];
    const BAMUM_SUPPLEMENT = [0x16800, 0x16A3F];
    const BASIC_LATIN = [0x0000, 0x007F];
    const BASSA_VAH = [0x16AD0, 0x16AFF];
    const BATAK = [0x1BC0, 0x1BFF];
    const BENGALI = [0x0980, 0x09FF];
    const BHAIKSUKI = [0x11C00, 0x11C6F];
    const BLOCK_ELEMENTS = [0x2580, 0x259F];
    const BOPOMOFO = [0x3100, 0x312F];
    const BOPOMOFO_EXTENDED = [0x31A0, 0x31BF];
    const BOX_DRAWING = [0x2500, 0x257F];
    const BRAHMI = [0x11000, 0x1107F];
    const BRAILLE_PATTERNS = [0x2800, 0x28FF];
    const BUGINESE = [0x1A00, 0x1A1F];
    const BUHID = [0x1740, 0x175F];
    const BYZANTINE_MUSICAL_SYMBOLS = [0x1D000, 0x1D0FF];
    const CARIAN = [0x102A0, 0x102DF];
    const CAUCASIAN_ALBANIAN = [0x10530, 0x1056F];
    const CHAKMA = [0x11100, 0x1114F];
    const CHAM = [0xAA00, 0xAA5F];
    const CHEROKEE = [0x13A0, 0x13FF];
    const CHEROKEE_SUPPLEMENT = [0xAB70, 0xABBF];
    const CJK_COMPATIBILITY = [0x3300, 0x33FF];
    const CJK_COMPATIBILITY_FORMS = [0xFE30, 0xFE4F];
    const CJK_COMPATIBILITY_IDEOGRAPHS = [0xF900, 0xFAFF];
    const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = [0x2F800, 0x2FA1F];
    const CJK_RADICALS_SUPPLEMENT = [0x2E80, 0x2EFF];
    const CJK_STROKES = [0x31C0, 0x31EF];
    const CJK_SYMBOLS_AND_PUNCTUATION = [0x3000, 0x303F];
    const CJK_UNIFIED_IDEOGRAPHS = [0x4E00, 0x9FFF];
    const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = [0x3400, 0x4DBF];
    const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = [0x20000, 0x2A6DF];
    const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = [0x2A700, 0x2B73F];
    const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = [0x2B740, 0x2B81F];
    const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E = [0x2B820, 0x2CEAF];
    const COMBINING_DIACRITICAL_MARKS = [0x0300, 0x036F];
    const COMBINING_DIACRITICAL_MARKS_EXTENDED = [0x1AB0, 0x1AFF];
    const COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS = [0x20D0, 0x20FF];
    const COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = [0x1DC0, 0x1DFF];
    const COMBINING_HALF_MARKS = [0xFE20, 0xFE2F];
    const COMMON_INDIC_NUMBER_FORMS = [0xA830, 0xA83F];
    const CONTROL_PICTURES = [0x2400, 0x243F];
    const COPTIC = [0x2C80, 0x2CFF];
    const COPTIC_EPACT_NUMBERS = [0x102E0, 0x102FF];
    const COUNTING_ROD_NUMERALS = [0x1D360, 0x1D37F];
    const CUNEIFORM = [0x12000, 0x123FF];
    const CUNEIFORM_NUMBERS_AND_PUNCTUATION = [0x12400, 0x1247F];
    const CURRENCY_SYMBOLS = [0x20A0, 0x20CF];
    const CYPRIOT_SYLLABARY = [0x10800, 0x1083F];
    const CYRILLIC = [0x0400, 0x04FF];
    const CYRILLIC_EXTENDED_A = [0x2DE0, 0x2DFF];
    const CYRILLIC_EXTENDED_B = [0xA640, 0xA69F];
    const CYRILLIC_EXTENDED_C = [0x1C80, 0x1C8F];
    const CYRILLIC_SUPPLEMENT = [0x0500, 0x052F];
    const DESERET = [0x10400, 0x1044F];
    const DEVANAGARI = [0x0900, 0x097F];
    const DEVANAGARI_EXTENDED = [0xA8E0, 0xA8FF];
    const DINGBATS = [0x2700, 0x27BF];
    const DOMINO_TILES = [0x1F030, 0x1F09F];
    const DUPLOYAN = [0x1BC00, 0x1BC9F];
    const EARLY_DYNASTIC_CUNEIFORM = [0x12480, 0x1254F];
    const EGYPTIAN_HIEROGLYPHS = [0x13000, 0x1342F];
    const ELBASAN = [0x10500, 0x1052F];
    const EMOTICONS = [0x1F600, 0x1F64F];
    const ENCLOSED_ALPHANUMERICS = [0x2460, 0x24FF];
    const ENCLOSED_ALPHANUMERIC_SUPPLEMENT = [0x1F100, 0x1F1FF];
    const ENCLOSED_CJK_LETTERS_AND_MONTHS = [0x3200, 0x32FF];
    const ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = [0x1F200, 0x1F2FF];
    const ETHIOPIC = [0x1200, 0x137F];
    const ETHIOPIC_EXTENDED = [0x2D80, 0x2DDF];
    const ETHIOPIC_EXTENDED_A = [0xAB00, 0xAB2F];
    const ETHIOPIC_SUPPLEMENT = [0x1380, 0x139F];
    const GENERAL_PUNCTUATION = [0x2000, 0x206F];
    const GEOMETRIC_SHAPES = [0x25A0, 0x25FF];
    const GEOMETRIC_SHAPES_EXTENDED = [0x1F780, 0x1F7FF];
    const GEORGIAN = [0x10A0, 0x10FF];
    const GEORGIAN_SUPPLEMENT = [0x2D00, 0x2D2F];
    const GLAGOLITIC = [0x2C00, 0x2C5F];
    const GLAGOLITIC_SUPPLEMENT = [0x1E000, 0x1E02F];
    const GOTHIC = [0x10330, 0x1034F];
    const GRANTHA = [0x11300, 0x1137F];
    const GREEK_AND_COPTIC = [0x0370, 0x03FF];
    const GREEK_EXTENDED = [0x1F00, 0x1FFF];
    const GUJARATI = [0x0A80, 0x0AFF];
    const GURMUKHI = [0x0A00, 0x0A7F];
    const HALFWIDTH_AND_FULLWIDTH_FORMS = [0xFF00, 0xFFEF];
    const HANGUL_COMPATIBILITY_JAMO = [0x3130, 0x318F];
    const HANGUL_JAMO = [0x1100, 0x11FF];
    const HANGUL_JAMO_EXTENDED_A = [0xA960, 0xA97F];
    const HANGUL_JAMO_EXTENDED_B = [0xD7B0, 0xD7FF];
    const HANGUL_SYLLABLES = [0xAC00, 0xD7AF];
    const HANUNOO = [0x1720, 0x173F];
    const HATRAN = [0x108E0, 0x108FF];
    const HEBREW = [0x0590, 0x05FF];
    const HIGH_PRIVATE_USE_SURROGATES = [0xDB80, 0xDBFF];
    const HIGH_SURROGATES = [0xD800, 0xDB7F];
    const HIRAGANA = [0x3040, 0x309F];
    const IDEOGRAPHIC_DESCRIPTION_CHARACTERS = [0x2FF0, 0x2FFF];
    const IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION = [0x16FE0, 0x16FFF];
    const IMPERIAL_ARAMAIC = [0x10840, 0x1085F];
    const INSCRIPTIONAL_PAHLAVI = [0x10B60, 0x10B7F];
    const INSCRIPTIONAL_PARTHIAN = [0x10B40, 0x10B5F];
    const IPA_EXTENSIONS = [0x0250, 0x02AF];
    const JAVANESE = [0xA980, 0xA9DF];
    const KAITHI = [0x11080, 0x110CF];
    const KANA_SUPPLEMENT = [0x1B000, 0x1B0FF];
    const KANBUN = [0x3190, 0x319F];
    const KANGXI_RADICALS = [0x2F00, 0x2FDF];
    const KANNADA = [0x0C80, 0x0CFF];
    const KATAKANA = [0x30A0, 0x30FF];
    const KATAKANA_PHONETIC_EXTENSIONS = [0x31F0, 0x31FF];
    const KAYAH_LI = [0xA900, 0xA92F];
    const KHAROSHTHI = [0x10A00, 0x10A5F];
    const KHMER = [0x1780, 0x17FF];
    const KHMER_SYMBOLS = [0x19E0, 0x19FF];
    const KHOJKI = [0x11200, 0x1124F];
    const KHUDAWADI = [0x112B0, 0x112FF];
    const LAO = [0x0E80, 0x0EFF];
    const LATIN_1_SUPPLEMENT = [0x0080, 0x00FF];
    const LATIN_EXTENDED_A = [0x0100, 0x017F];
    const LATIN_EXTENDED_ADDITIONAL = [0x1E00, 0x1EFF];
    const LATIN_EXTENDED_B = [0x0180, 0x024F];
    const LATIN_EXTENDED_C = [0x2C60, 0x2C7F];
    const LATIN_EXTENDED_D = [0xA720, 0xA7FF];
    const LATIN_EXTENDED_E = [0xAB30, 0xAB6F];
    const LEPCHA = [0x1C00, 0x1C4F];
    const LETTERLIKE_SYMBOLS = [0x2100, 0x214F];
    const LIMBU = [0x1900, 0x194F];
    const LINEAR_A = [0x10600, 0x1077F];
    const LINEAR_B_IDEOGRAMS = [0x10080, 0x100FF];
    const LINEAR_B_SYLLABARY = [0x10000, 0x1007F];
    const LISU = [0xA4D0, 0xA4FF];
    const LOW_SURROGATES = [0xDC00, 0xDFFF];
    const LYCIAN = [0x10280, 0x1029F];
    const LYDIAN = [0x10920, 0x1093F];
    const MAHAJANI = [0x11150, 0x1117F];
    const MAHJONG_TILES = [0x1F000, 0x1F02F];
    const MALAYALAM = [0x0D00, 0x0D7F];
    const MANDAIC = [0x0840, 0x085F];
    const MANICHAEAN = [0x10AC0, 0x10AFF];
    const MARCHEN = [0x11C70, 0x11CBF];
    const MATHEMATICAL_ALPHANUMERIC_SYMBOLS = [0x1D400, 0x1D7FF];
    const MATHEMATICAL_OPERATORS = [0x2200, 0x22FF];
    const MEETEI_MAYEK = [0xABC0, 0xABFF];
    const MEETEI_MAYEK_EXTENSIONS = [0xAAE0, 0xAAFF];
    const MENDE_KIKAKUI = [0x1E800, 0x1E8DF];
    const MEROITIC_CURSIVE = [0x109A0, 0x109FF];
    const MEROITIC_HIEROGLYPHS = [0x10980, 0x1099F];
    const MIAO = [0x16F00, 0x16F9F];
    const MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = [0x27C0, 0x27EF];
    const MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = [0x2980, 0x29FF];
    const MISCELLANEOUS_SYMBOLS = [0x2600, 0x26FF];
    const MISCELLANEOUS_SYMBOLS_AND_ARROWS = [0x2B00, 0x2BFF];
    const MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = [0x1F300, 0x1F5FF];
    const MISCELLANEOUS_TECHNICAL = [0x2300, 0x23FF];
    const MODI = [0x11600, 0x1165F];
    const MODIFIER_TONE_LETTERS = [0xA700, 0xA71F];
    const MONGOLIAN = [0x1800, 0x18AF];
    const MONGOLIAN_SUPPLEMENT = [0x11660, 0x1167F];
    const MRO = [0x16A40, 0x16A6F];
    const MULTANI = [0x11280, 0x112AF];
    const MUSICAL_SYMBOLS = [0x1D100, 0x1D1FF];
    const MYANMAR = [0x1000, 0x109F];
    const MYANMAR_EXTENDED_A = [0xAA60, 0xAA7F];
    const MYANMAR_EXTENDED_B = [0xA9E0, 0xA9FF];
    const NABATAEAN = [0x10880, 0x108AF];
    const NEWA = [0x11400, 0x1147F];
    const NEW_TAI_LUE = [0x1980, 0x19DF];
    const NKO = [0x07C0, 0x07FF];
    const NUMBER_FORMS = [0x2150, 0x218F];
    const OGHAM = [0x1680, 0x169F];
    const OLD_HUNGARIAN = [0x10C80, 0x10CFF];
    const OLD_ITALIC = [0x10300, 0x1032F];
    const OLD_NORTH_ARABIAN = [0x10A80, 0x10A9F];
    const OLD_PERMIC = [0x10350, 0x1037F];
    const OLD_PERSIAN = [0x103A0, 0x103DF];
    const OLD_SOUTH_ARABIAN = [0x10A60, 0x10A7F];
    const OLD_TURKIC = [0x10C00, 0x10C4F];
    const OL_CHIKI = [0x1C50, 0x1C7F];
    const OPTICAL_CHARACTER_RECOGNITION = [0x2440, 0x245F];
    const ORIYA = [0x0B00, 0x0B7F];
    const ORNAMENTAL_DINGBATS = [0x1F650, 0x1F67F];
    const OSAGE = [0x104B0, 0x104FF];
    const OSMANYA = [0x10480, 0x104AF];
    const PAHAWH_HMONG = [0x16B00, 0x16B8F];
    const PALMYRENE = [0x10860, 0x1087F];
    const PAU_CIN_HAU = [0x11AC0, 0x11AFF];
    const PHAGS_PA = [0xA840, 0xA87F];
    const PHAISTOS_DISC = [0x101D0, 0x101FF];
    const PHOENICIAN = [0x10900, 0x1091F];
    const PHONETIC_EXTENSIONS = [0x1D00, 0x1D7F];
    const PHONETIC_EXTENSIONS_SUPPLEMENT = [0x1D80, 0x1DBF];
    const PLAYING_CARDS = [0x1F0A0, 0x1F0FF];
    const PRIVATE_USE_AREA = [0xE000, 0xF8FF];
    const PSALTER_PAHLAVI = [0x10B80, 0x10BAF];
    const REJANG = [0xA930, 0xA95F];
    const RUMI_NUMERAL_SYMBOLS = [0x10E60, 0x10E7F];
    const RUNIC = [0x16A0, 0x16FF];
    const SAMARITAN = [0x0800, 0x083F];
    const SAURASHTRA = [0xA880, 0xA8DF];
    const SHARADA = [0x11180, 0x111DF];
    const SHAVIAN = [0x10450, 0x1047F];
    const SHORTHAND_FORMAT_CONTROLS = [0x1BCA0, 0x1BCAF];
    const SIDDHAM = [0x11580, 0x115FF];
    const SINHALA = [0x0D80, 0x0DFF];
    const SINHALA_ARCHAIC_NUMBERS = [0x111E0, 0x111FF];
    const SMALL_FORM_VARIANTS = [0xFE50, 0xFE6F];
    const SORA_SOMPENG = [0x110D0, 0x110FF];
    const SPACING_MODIFIER_LETTERS = [0x02B0, 0x02FF];
    const SPECIALS = [0xFFF0, 0xFFFF];
    const SUNDANESE = [0x1B80, 0x1BBF];
    const SUNDANESE_SUPPLEMENT = [0x1CC0, 0x1CCF];
    const SUPERSCRIPTS_AND_SUBSCRIPTS = [0x2070, 0x209F];
    const SUPPLEMENTAL_ARROWS_A = [0x27F0, 0x27FF];
    const SUPPLEMENTAL_ARROWS_B = [0x2900, 0x297F];
    const SUPPLEMENTAL_ARROWS_C = [0x1F800, 0x1F8FF];
    const SUPPLEMENTAL_MATHEMATICAL_OPERATORS = [0x2A00, 0x2AFF];
    const SUPPLEMENTAL_PUNCTUATION = [0x2E00, 0x2E7F];
    const SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS = [0x1F900, 0x1F9FF];
    const SUPPLEMENTARY_PRIVATE_USE_AREA_A = [0xF0000, 0xFFFFF];
    const SUPPLEMENTARY_PRIVATE_USE_AREA_B = [0x100000, 0x10FFFF];
    const SUTTON_SIGNWRITING = [0x1D800, 0x1DAAF];
    const SYLOTI_NAGRI = [0xA800, 0xA82F];
    const SYRIAC = [0x0700, 0x074F];
    const TAGALOG = [0x1700, 0x171F];
    const TAGBANWA = [0x1760, 0x177F];
    const TAGS = [0xE0000, 0xE007F];
    const TAI_LE = [0x1950, 0x197F];
    const TAI_THAM = [0x1A20, 0x1AAF];
    const TAI_VIET = [0xAA80, 0xAADF];
    const TAI_XUAN_JING_SYMBOLS = [0x1D300, 0x1D35F];
    const TAKRI = [0x11680, 0x116CF];
    const TAMIL = [0x0B80, 0x0BFF];
    const TANGUT = [0x17000, 0x187FF];
    const TANGUT_COMPONENTS = [0x18800, 0x18AFF];
    const TELUGU = [0x0C00, 0x0C7F];
    const THAANA = [0x0780, 0x07BF];
    const THAI = [0x0E00, 0x0E7F];
    const TIBETAN = [0x0F00, 0x0FFF];
    const TIFINAGH = [0x2D30, 0x2D7F];
    const TIRHUTA = [0x11480, 0x114DF];
    const TRANSPORT_AND_MAP_SYMBOLS = [0x1F680, 0x1F6FF];
    const UGARITIC = [0x10380, 0x1039F];
    const UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = [0x1400, 0x167F];
    const UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = [0x18B0, 0x18FF];
    const VAI = [0xA500, 0xA63F];
    const VARIATION_SELECTORS = [0xFE00, 0xFE0F];
    const VARIATION_SELECTORS_SUPPLEMENT = [0xE0100, 0xE01EF];
    const VEDIC_EXTENSIONS = [0x1CD0, 0x1CFF];
    const VERTICAL_FORMS = [0xFE10, 0xFE1F];
    const WARANG_CITI = [0x118A0, 0x118FF];
    const YIJING_HEXAGRAM_SYMBOLS = [0x4DC0, 0x4DFF];
    const YI_RADICALS = [0xA490, 0xA4CF];
    const YI_SYLLABLES = [0xA000, 0xA48F];

    /** @var int[][] $blocks List of [first, last] code point pairs to check against */
    private $blocks = [];

    /**
     * Object constructor
     *
     * Can be passed one or more of the following:
     * * a constant defining a Unicode character block
     * * a string containing the name of a constant
     * * an array containing start and end code points in numeric form
     *
     * @param string|int[] ...$blocks The block(s) to check against
     * @throws \InvalidArgumentException if an invalid block is passed
     */
    public function __construct(...$blocks)
    {
        foreach ($blocks as $block) {
            $this->addBlock($block);
        }
    }

    /**
     * Adds a block to the existing list
     *
     * @param string|int[] $block The block to add (see constructor for details)
     * @return void
     * @throws \InvalidArgumentException if an invalid block is passed
     */
    public function addBlock($block)
    {
        $this->blocks[] = self::resolveBlock($block);
    }

    /**
     * Checks if the given string is composed only of characters in the defined block(s)
     *
     * An empty string (or null) is vacuously valid. When no block has been
     * configured, no character is allowed and any non-empty string fails.
     *
     * @param string $string The UTF-8 string to check
     * @return boolean
     */
    public function check($string)
    {
        // Strict comparison on purpose: empty() would also treat "0" as
        // empty and report it valid for every block.
        if ($string === null || $string === '') {
            return true;
        }

        return self::matchesBlocks($string, $this->blocks);
    }

    /**
     * Static method to check if the string is composed only of characters in a block
     *
     * As with check(), an empty string (or null) is reported valid before the
     * block argument is examined.
     *
     * @param string $string The UTF-8 string to check
     * @param string|int[] $block The block to check against (see constructor for details)
     * @return boolean
     * @throws \InvalidArgumentException if an invalid block is passed
     */
    public static function isValid($string, $block)
    {
        if ($string === null || $string === '') {
            return true;
        }

        return self::matchesBlocks($string, [self::resolveBlock($block)]);
    }

    /**
     * Turns any accepted block description into a validated [first, last] pair
     *
     * @param string|int[] $block A constant name, or a two-element numeric array
     * @return int[] The first and last code point of the block
     * @throws \InvalidArgumentException if the name is unknown or the range is malformed
     */
    private static function resolveBlock($block)
    {
        if (is_string($block)) {
            $constant = __CLASS__ . '::' . $block;
            if (!defined($constant)) {
                throw new \InvalidArgumentException("Bad character range passed!");
            }
            $block = constant($constant);
        }

        if (
            !is_array($block)
            || !isset($block[0], $block[1])
            || !is_numeric($block[0])
            || !is_numeric($block[1])
        ) {
            throw new \InvalidArgumentException("Bad character range passed!");
        }

        $first = (int)$block[0];
        $last = (int)$block[1];
        // U+10FFFF is the last Unicode code point; anything beyond it cannot
        // be expressed in a UTF-8 pattern.
        if ($first < 0 || $last > 0x10FFFF || $last < $first) {
            throw new \InvalidArgumentException("Bad character range passed!");
        }

        return [$first, $last];
    }

    /**
     * Tests a non-empty string against a list of validated code point ranges
     *
     * @param string $string The UTF-8 string to check
     * @param int[][] $blocks Validated [first, last] pairs
     * @return boolean True only if every character falls inside one of the ranges
     */
    private static function matchesBlocks($string, array $blocks)
    {
        $class = '';
        foreach ($blocks as $block) {
            list($first, $last) = $block;

            // Surrogate code points (U+D800..U+DFFF) cannot occur in valid
            // UTF-8 and PCRE refuses them in a /u pattern, which would
            // invalidate the whole expression. Keep only the parts of the
            // range that lie outside the surrogate area.
            if ($last < 0xD800 || $first > 0xDFFF) {
                $class .= sprintf('\x{%x}-\x{%x}', $first, $last);
                continue;
            }
            if ($first < 0xD800) {
                $class .= sprintf('\x{%x}-\x{%x}', $first, 0xD7FF);
            }
            if ($last > 0xDFFF) {
                $class .= sprintf('\x{%x}-\x{%x}', 0xE000, $last);
            }
        }

        // No usable range means no character is permitted.
        if ($class === '') {
            return false;
        }

        // \z anchors at the very end of the subject; "$" would also match
        // just before a trailing newline and let that newline slip through.
        // preg_match() returns false for invalid UTF-8, which is reported
        // here as "not valid".
        return preg_match('/^[' . $class . ']*\z/u', $string) === 1;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment