Skip to content

Instantly share code, notes, and snippets.

@siahr
Created February 18, 2012 03:24
Show Gist options
  • Save siahr/1857186 to your computer and use it in GitHub Desktop.
Save siahr/1857186 to your computer and use it in GitHub Desktop.
Token Utilities
<?php
/**
* Token Utilities class
*
* Parts of this PHP code are based on the work of the BaseX Team:
* https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/Token.java
*
* @package Sugina
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
* @author Toshio HIRAI (Porting to PHP)
* @license http://opensource.org/licenses/BSD-3-Clause The BSD License
*/
class TokenUtil {
    /** Static utility class; not meant to be instantiated. */
    private function __construct(){}

    /**
     * Returns the encoding used for the 16-bit code-unit representation.
     *
     * Uses Tagger::$DIC_ENC when a Tagger class is available and the value
     * is non-empty, otherwise falls back to "UTF-16LE".
     *
     * @return string encoding name understood by mbstring
     */
    private static function dicEncoding() {
        if (class_exists("Tagger") && !empty(Tagger::$DIC_ENC)) {
            return Tagger::$DIC_ENC;
        }
        return "UTF-16LE";
    }

    /**
     * Returns the pack()/unpack() format code for one 16-bit unit.
     *
     * "S" is machine byte order, which only matches UTF-16LE on
     * little-endian hosts. Use the explicit-endian codes when the encoding
     * names its byte order, and keep "S" for anything else.
     *
     * @param string $dicEnc encoding name
     * @return string one of "v", "n" or "S"
     */
    private static function unitFormat($dicEnc) {
        switch (strtoupper((string) $dicEnc)) {
            case 'UTF-16LE':
            case 'UCS-2LE':
                return 'v';
            case 'UTF-16BE':
            case 'UCS-2BE':
                return 'n';
            default:
                return 'S';
        }
    }

    /**
     * Converts a string into a list of 16-bit code units.
     *
     * @param string      $str      input string
     * @param string|null $encoding encoding of $str; detected with
     *                              mb_detect_encoding() when omitted
     * @return array zero-based list of integer code units
     */
    public static function toChars($str, $encoding=null) {
        if ($encoding == null) {
            $encoding = mb_detect_encoding($str);
            if ($encoding === false) {
                // Detection failed: do not hand "false" to mb_convert_encoding().
                $encoding = mb_internal_encoding();
            }
        }
        $dicEnc = self::dicEncoding();
        $bin = mb_convert_encoding($str, $dicEnc, $encoding);
        if ($bin === false || $bin === '') {
            return array();
        }
        $units = unpack(self::unitFormat($dicEnc) . "*", $bin);
        // unpack() keys start at 1; re-index so callers get a plain list.
        return $units === false ? array() : array_values($units);
    }

    /**
     * Converts a list of 16-bit code units back into a string.
     *
     * All units are packed into one buffer and converted in a single call,
     * so surrogate pairs (characters outside the BMP) stay together instead
     * of being converted as two isolated, invalid halves.
     *
     * @param array       $chars    list of integer code units
     * @param string|null $encoding target encoding; mb_internal_encoding()
     *                              when omitted
     * @return string
     */
    public static function toString(array $chars, $encoding=null) {
        if ($encoding == null) {
            $encoding = mb_internal_encoding();
        }
        if (count($chars) === 0) {
            return "";
        }
        $dicEnc = self::dicEncoding();
        $fmt = self::unitFormat($dicEnc);
        $bin = "";
        foreach ($chars as $c) {
            $bin .= pack($fmt, $c);
        }
        return mb_convert_encoding($bin, $encoding, $dicEnc);
    }

    /**
     * Lower-cases a single ASCII upper-case letter (A-Z).
     *
     * @param int $chr code unit
     * @return int $chr + 32 for 65..90, otherwise $chr unchanged
     */
    public static function lc($chr) {
        return ($chr >= 65 && $chr <= 90) ? $chr + 32 : $chr;
    }

    /**
     * Tests whether every unit is an ASCII letter (0x0041-0x005A, 0x0061-0x007A).
     *
     * @param array $chars list of code units
     * @return bool true for an empty list
     */
    public static function letter($chars) {
        foreach ($chars as $c) {
            if (($c < 65 || $c > 90) && ($c < 97 || $c > 122)) return false;
        }
        return true;
    }

    /**
     * Tests whether every unit is an ASCII digit (0x0030-0x0039).
     *
     * @param array $chars list of code units
     * @return bool true for an empty list
     */
    public static function digit($chars) {
        foreach ($chars as $c) {
            if ($c < 48 || $c > 57) return false;
        }
        return true;
    }

    /**
     * Tests whether the list consists only of letters or only of digits.
     *
     * Note: a list mixing letters and digits (e.g. "a1") yields false,
     * because the two checks are applied to the whole list, not per unit.
     *
     * @param array $chars list of code units
     * @return bool
     */
    public static function letterOrDigit($chars) {
        return self::letter($chars) || self::digit($chars);
    }

    /**
     * Tests whether every unit is below 127 (0x0000-0x007E).
     *
     * @param array $chars list of code units
     * @return bool true for an empty list
     */
    public static function ascii($chars) {
        foreach ($chars as $c) {
            if ($c >= 127) return false;
        }
        return true;
    }

    /**
     * Tests whether every unit is below 512 (0x0000-0x01FF).
     *
     * @param array $chars list of code units
     * @return bool true for an empty list
     */
    public static function western($chars) {
        foreach ($chars as $c) {
            if ($c >= 512) return false;
        }
        return true;
    }

    /**
     * Returns the code units with diacritics removed.
     *
     * If any unit is outside the western range (see western()), the input
     * is returned unchanged. Otherwise each unit >= 0x00C0 that has an entry
     * in NC() is replaced by its base letter; units without an entry
     * (e.g. 0x00D7, 0x00F7) are kept as they are.
     *
     * @param array $chars list of code units
     * @return array list of code units
     */
    public static function dia($chars) {
        if (!self::western($chars)) {
            return $chars;
        }
        // Build the table once, not once per character.
        $nc = self::NC();
        $res = array();
        foreach ($chars as $c) {
            if ($c >= 192) {
                $key = self::hex($c);
                if (isset($nc[$key])) {
                    $c = ord($nc[$key]);
                }
            }
            $res[] = $c;
        }
        return $res;
    }

    /**
     * Formats a code unit as an upper-case hex key such as "0x00E9".
     *
     * The value is zero-padded to at least four hex digits; longer values
     * are not truncated.
     *
     * @param int $char code unit
     * @return string
     */
    public static function hex($char) {
        return sprintf('0x%04X', $char);
    }

    /**
     * Normalized characters.
     *
     * @return array map from hex key (see hex()) to the replacement letter
     */
    public static function NC() {
        return array(
            '0x00C0'=>'A' , '0x00C1'=>'A' , '0x00C2'=>'A' , '0x00C3'=>'A' ,
            '0x00C4'=>'A' , '0x00C5'=>'A' , '0x00C6'=>'A' , '0x00C7'=>'C' ,
            '0x00C8'=>'E' , '0x00C9'=>'E' , '0x00CA'=>'E' , '0x00CB'=>'E' ,
            '0x00CC'=>'I' , '0x00CD'=>'I' , '0x00CE'=>'I' , '0x00CF'=>'I' ,
            '0x00D0'=>'D' , '0x00D1'=>'N' , '0x00D2'=>'O' , '0x00D3'=>'O' ,
            '0x00D4'=>'O' , '0x00D5'=>'O' , '0x00D6'=>'O' , '0x00D8'=>'O' ,
            '0x00D9'=>'U' , '0x00DA'=>'U' , '0x00DB'=>'U' , '0x00DC'=>'U' ,
            '0x00DD'=>'Y' , '0x00DE'=>'d' , '0x00DF'=>'s' , '0x00E0'=>'a' ,
            '0x00E1'=>'a' , '0x00E2'=>'a' , '0x00E3'=>'a' , '0x00E4'=>'a' ,
            '0x00E5'=>'a' , '0x00E6'=>'a' , '0x00E7'=>'c' , '0x00E8'=>'e' ,
            '0x00E9'=>'e' , '0x00EA'=>'e' , '0x00EB'=>'e' , '0x00EC'=>'i' ,
            '0x00ED'=>'i' , '0x00EE'=>'i' , '0x00EF'=>'i' , '0x00F0'=>'d' ,
            '0x00F1'=>'n' , '0x00F2'=>'o' , '0x00F3'=>'o' , '0x00F4'=>'o' ,
            '0x00F5'=>'o' , '0x00F6'=>'o' , '0x00F8'=>'o' , '0x00F9'=>'u' ,
            '0x00FA'=>'u' , '0x00FB'=>'u' , '0x00FC'=>'u' , '0x00FD'=>'y' ,
            '0x00FE'=>'d' , '0x00FF'=>'y' , '0x0100'=>'A' , '0x0101'=>'a' ,
            '0x0102'=>'A' , '0x0103'=>'a' , '0x0104'=>'A' , '0x0105'=>'a' ,
            '0x0106'=>'C' , '0x0107'=>'c' , '0x0108'=>'C' , '0x0109'=>'c' ,
            '0x010A'=>'C' , '0x010B'=>'c' , '0x010C'=>'C' , '0x010D'=>'c' ,
            '0x010E'=>'D' , '0x010F'=>'d' , '0x0110'=>'D' , '0x0111'=>'d' ,
            '0x0112'=>'E' , '0x0113'=>'e' , '0x0114'=>'E' , '0x0115'=>'e' ,
            '0x0116'=>'E' , '0x0117'=>'e' , '0x0118'=>'E' , '0x0119'=>'e' ,
            '0x011A'=>'E' , '0x011B'=>'e' , '0x011C'=>'G' , '0x011D'=>'g' ,
            '0x011E'=>'G' , '0x011F'=>'g' , '0x0120'=>'G' , '0x0121'=>'g' ,
            '0x0122'=>'G' , '0x0123'=>'g' , '0x0124'=>'H' , '0x0125'=>'h' ,
            '0x0126'=>'H' , '0x0127'=>'h' , '0x0128'=>'I' , '0x0129'=>'i' ,
            '0x012A'=>'I' , '0x012B'=>'i' , '0x012C'=>'I' , '0x012D'=>'i' ,
            '0x012E'=>'I' , '0x012F'=>'i' , '0x0130'=>'I' , '0x0131'=>'i' ,
            '0x0132'=>'I' , '0x0133'=>'i' , '0x0134'=>'J' , '0x0135'=>'j' ,
            '0x0136'=>'K' , '0x0137'=>'k' , '0x0138'=>'k' , '0x0139'=>'L' ,
            '0x013A'=>'l' , '0x013B'=>'L' , '0x013C'=>'l' , '0x013D'=>'L' ,
            '0x013E'=>'l' , '0x013F'=>'L' , '0x0140'=>'l' , '0x0141'=>'L' ,
            '0x0142'=>'l' , '0x0143'=>'N' , '0x0144'=>'n' , '0x0145'=>'N' ,
            '0x0146'=>'n' , '0x0147'=>'N' , '0x0148'=>'n' , '0x0149'=>'n' ,
            '0x014A'=>'N' , '0x014B'=>'n' , '0x014C'=>'O' , '0x014D'=>'o' ,
            '0x014E'=>'O' , '0x014F'=>'o' , '0x0150'=>'O' , '0x0151'=>'o' ,
            '0x0152'=>'O' , '0x0153'=>'o' , '0x0154'=>'R' , '0x0155'=>'r' ,
            '0x0156'=>'R' , '0x0157'=>'r' , '0x0158'=>'R' , '0x0159'=>'r' ,
            '0x015A'=>'S' , '0x015B'=>'s' , '0x015C'=>'S' , '0x015D'=>'s' ,
            '0x015E'=>'S' , '0x015F'=>'s' , '0x0160'=>'S' , '0x0161'=>'s' ,
            '0x0162'=>'T' , '0x0163'=>'t' , '0x0164'=>'T' , '0x0165'=>'t' ,
            '0x0166'=>'T' , '0x0167'=>'t' , '0x0168'=>'U' , '0x0169'=>'u' ,
            '0x016A'=>'U' , '0x016B'=>'u' , '0x016C'=>'U' , '0x016D'=>'u' ,
            '0x016E'=>'U' , '0x016F'=>'u' , '0x0170'=>'U' , '0x0171'=>'u' ,
            '0x0172'=>'U' , '0x0173'=>'u' , '0x0174'=>'W' , '0x0175'=>'w' ,
            '0x0176'=>'Y' , '0x0177'=>'y' , '0x0178'=>'Y' , '0x0179'=>'Z' ,
            '0x017A'=>'z' , '0x017B'=>'Z' , '0x017C'=>'z' , '0x017D'=>'Z' ,
            '0x017E'=>'z' , '0x01FA'=>'A' , '0x01FB'=>'a' , '0x01FC'=>'A' ,
            '0x01FD'=>'a' , '0x01FE'=>'O' , '0x01FF'=>'o'
        );
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment