Skip to content

Instantly share code, notes, and snippets.

@voku
Created May 2, 2018 07:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save voku/e5f634fd2fdc77b5363339fa5bed7e19 to your computer and use it in GitHub Desktop.
Save voku/e5f634fd2fdc77b5363339fa5bed7e19 to your computer and use it in GitHub Desktop.
A simple, small, and dependency-free version of https://github.com/voku/portable-utf8
<?php
/**
* Class StringLib
*
* @see https://github.com/voku/portable-utf8
*/
class StringLib {
/**
* @var array
*/
private static $SUPPORT = array();
/**
* @var array
*/
private static $ENCODINGS = array(
'ANSI_X3.4-1968',
'ANSI_X3.4-1986',
'ASCII',
'CP367',
'IBM367',
'ISO-IR-6',
'ISO646-US',
'ISO_646.IRV:1991',
'US',
'US-ASCII',
'CSASCII',
'UTF-8',
'ISO-10646-UCS-2',
'UCS-2',
'CSUNICODE',
'UCS-2BE',
'UNICODE-1-1',
'UNICODEBIG',
'CSUNICODE11',
'UCS-2LE',
'UNICODELITTLE',
'ISO-10646-UCS-4',
'UCS-4',
'CSUCS4',
'UCS-4BE',
'UCS-4LE',
'UTF-16',
'UTF-16BE',
'UTF-16LE',
'UTF-32',
'UTF-32BE',
'UTF-32LE',
'UNICODE-1-1-UTF-7',
'UTF-7',
'CSUNICODE11UTF7',
'UCS-2-INTERNAL',
'UCS-2-SWAPPED',
'UCS-4-INTERNAL',
'UCS-4-SWAPPED',
'C99',
'JAVA',
'CP819',
'IBM819',
'ISO-8859-1',
'ISO-IR-100',
'ISO8859-1',
'ISO_8859-1',
'ISO_8859-1:1987',
'L1',
'LATIN1',
'CSISOLATIN1',
'ISO-8859-2',
'ISO-IR-101',
'ISO8859-2',
'ISO_8859-2',
'ISO_8859-2:1987',
'L2',
'LATIN2',
'CSISOLATIN2',
'ISO-8859-3',
'ISO-IR-109',
'ISO8859-3',
'ISO_8859-3',
'ISO_8859-3:1988',
'L3',
'LATIN3',
'CSISOLATIN3',
'ISO-8859-4',
'ISO-IR-110',
'ISO8859-4',
'ISO_8859-4',
'ISO_8859-4:1988',
'L4',
'LATIN4',
'CSISOLATIN4',
'CYRILLIC',
'ISO-8859-5',
'ISO-IR-144',
'ISO8859-5',
'ISO_8859-5',
'ISO_8859-5:1988',
'CSISOLATINCYRILLIC',
'ARABIC',
'ASMO-708',
'ECMA-114',
'ISO-8859-6',
'ISO-IR-127',
'ISO8859-6',
'ISO_8859-6',
'ISO_8859-6:1987',
'CSISOLATINARABIC',
'ECMA-118',
'ELOT_928',
'GREEK',
'GREEK8',
'ISO-8859-7',
'ISO-IR-126',
'ISO8859-7',
'ISO_8859-7',
'ISO_8859-7:1987',
'ISO_8859-7:2003',
'CSISOLATINGREEK',
'HEBREW',
'ISO-8859-8',
'ISO-IR-138',
'ISO8859-8',
'ISO_8859-8',
'ISO_8859-8:1988',
'CSISOLATINHEBREW',
'ISO-8859-9',
'ISO-IR-148',
'ISO8859-9',
'ISO_8859-9',
'ISO_8859-9:1989',
'L5',
'LATIN5',
'CSISOLATIN5',
'ISO-8859-10',
'ISO-IR-157',
'ISO8859-10',
'ISO_8859-10',
'ISO_8859-10:1992',
'L6',
'LATIN6',
'CSISOLATIN6',
'ISO-8859-11',
'ISO8859-11',
'ISO_8859-11',
'ISO-8859-13',
'ISO-IR-179',
'ISO8859-13',
'ISO_8859-13',
'L7',
'LATIN7',
'ISO-8859-14',
'ISO-CELTIC',
'ISO-IR-199',
'ISO8859-14',
'ISO_8859-14',
'ISO_8859-14:1998',
'L8',
'LATIN8',
'ISO-8859-15',
'ISO-IR-203',
'ISO8859-15',
'ISO_8859-15',
'ISO_8859-15:1998',
'LATIN-9',
'ISO-8859-16',
'ISO-IR-226',
'ISO8859-16',
'ISO_8859-16',
'ISO_8859-16:2001',
'L10',
'LATIN10',
'KOI8-R',
'CSKOI8R',
'KOI8-U',
'KOI8-RU',
'CP1250',
'MS-EE',
'WINDOWS-1250',
'CP1251',
'MS-CYRL',
'WINDOWS-1251',
'CP1252',
'MS-ANSI',
'WINDOWS-1252',
'CP1253',
'MS-GREEK',
'WINDOWS-1253',
'CP1254',
'MS-TURK',
'WINDOWS-1254',
'CP1255',
'MS-HEBR',
'WINDOWS-1255',
'CP1256',
'MS-ARAB',
'WINDOWS-1256',
'CP1257',
'WINBALTRIM',
'WINDOWS-1257',
'CP1258',
'WINDOWS-1258',
'850',
'CP850',
'IBM850',
'CSPC850MULTILINGUAL',
'862',
'CP862',
'IBM862',
'CSPC862LATINHEBREW',
'866',
'CP866',
'IBM866',
'CSIBM866',
'MAC',
'MACINTOSH',
'MACROMAN',
'CSMACINTOSH',
'MACCENTRALEUROPE',
'MACICELAND',
'MACCROATIAN',
'MACROMANIA',
'MACCYRILLIC',
'MACUKRAINE',
'MACGREEK',
'MACTURKISH',
'MACHEBREW',
'MACARABIC',
'MACTHAI',
'HP-ROMAN8',
'R8',
'ROMAN8',
'CSHPROMAN8',
'NEXTSTEP',
'ARMSCII-8',
'GEORGIAN-ACADEMY',
'GEORGIAN-PS',
'KOI8-T',
'CP154',
'CYRILLIC-ASIAN',
'PT154',
'PTCP154',
'CSPTCP154',
'KZ-1048',
'RK1048',
'STRK1048-2002',
'CSKZ1048',
'MULELAO-1',
'CP1133',
'IBM-CP1133',
'ISO-IR-166',
'TIS-620',
'TIS620',
'TIS620-0',
'TIS620.2529-1',
'TIS620.2533-0',
'TIS620.2533-1',
'CP874',
'WINDOWS-874',
'VISCII',
'VISCII1.1-1',
'CSVISCII',
'TCVN',
'TCVN-5712',
'TCVN5712-1',
'TCVN5712-1:1993',
'ISO-IR-14',
'ISO646-JP',
'JIS_C6220-1969-RO',
'JP',
'CSISO14JISC6220RO',
'JISX0201-1976',
'JIS_X0201',
'X0201',
'CSHALFWIDTHKATAKANA',
'ISO-IR-87',
'JIS0208',
'JIS_C6226-1983',
'JIS_X0208',
'JIS_X0208-1983',
'JIS_X0208-1990',
'X0208',
'CSISO87JISX0208',
'ISO-IR-159',
'JIS_X0212',
'JIS_X0212-1990',
'JIS_X0212.1990-0',
'X0212',
'CSISO159JISX02121990',
'CN',
'GB_1988-80',
'ISO-IR-57',
'ISO646-CN',
'CSISO57GB1988',
'CHINESE',
'GB_2312-80',
'ISO-IR-58',
'CSISO58GB231280',
'CN-GB-ISOIR165',
'ISO-IR-165',
'ISO-IR-149',
'KOREAN',
'KSC_5601',
'KS_C_5601-1987',
'KS_C_5601-1989',
'CSKSC56011987',
'EUC-JP',
'EUCJP',
'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
'CSEUCPKDFMTJAPANESE',
'MS_KANJI',
'SHIFT-JIS',
'SHIFT_JIS',
'SJIS',
'CSSHIFTJIS',
'CP932',
'ISO-2022-JP',
'CSISO2022JP',
'ISO-2022-JP-1',
'ISO-2022-JP-2',
'CSISO2022JP2',
'CN-GB',
'EUC-CN',
'EUCCN',
'GB2312',
'CSGB2312',
'GBK',
'CP936',
'MS936',
'WINDOWS-936',
'GB18030',
'ISO-2022-CN',
'CSISO2022CN',
'ISO-2022-CN-EXT',
'HZ',
'HZ-GB-2312',
'EUC-TW',
'EUCTW',
'CSEUCTW',
'BIG-5',
'BIG-FIVE',
'BIG5',
'BIGFIVE',
'CN-BIG5',
'CSBIG5',
'CP950',
'BIG5-HKSCS:1999',
'BIG5-HKSCS:2001',
'BIG5-HKSCS',
'BIG5-HKSCS:2004',
'BIG5HKSCS',
'EUC-KR',
'EUCKR',
'CSEUCKR',
'CP949',
'UHC',
'CP1361',
'JOHAB',
'ISO-2022-KR',
'CSISO2022KR',
'CP856',
'CP922',
'CP943',
'CP1046',
'CP1124',
'CP1129',
'CP1161',
'IBM-1161',
'IBM1161',
'CSIBM1161',
'CP1162',
'IBM-1162',
'IBM1162',
'CSIBM1162',
'CP1163',
'IBM-1163',
'IBM1163',
'CSIBM1163',
'DEC-KANJI',
'DEC-HANYU',
'437',
'CP437',
'IBM437',
'CSPC8CODEPAGE437',
'CP737',
'CP775',
'IBM775',
'CSPC775BALTIC',
'852',
'CP852',
'IBM852',
'CSPCP852',
'CP853',
'855',
'CP855',
'IBM855',
'CSIBM855',
'857',
'CP857',
'IBM857',
'CSIBM857',
'CP858',
'860',
'CP860',
'IBM860',
'CSIBM860',
'861',
'CP-IS',
'CP861',
'IBM861',
'CSIBM861',
'863',
'CP863',
'IBM863',
'CSIBM863',
'CP864',
'IBM864',
'CSIBM864',
'865',
'CP865',
'IBM865',
'CSIBM865',
'869',
'CP-GR',
'CP869',
'IBM869',
'CSIBM869',
'CP1125',
'EUC-JISX0213',
'SHIFT_JISX0213',
'ISO-2022-JP-3',
'BIG5-2003',
'ISO-IR-230',
'TDS565',
'ATARI',
'ATARIST',
'RISCOS-LATIN1',
);
/**
* @var array
*/
private static $ORD = array(
"\x00" => 0,
"\x01" => 1,
"\x02" => 2,
"\x03" => 3,
"\x04" => 4,
"\x05" => 5,
"\x06" => 6,
"\x07" => 7,
"\x08" => 8,
"\x09" => 9,
"\x0A" => 10,
"\x0B" => 11,
"\x0C" => 12,
"\x0D" => 13,
"\x0E" => 14,
"\x0F" => 15,
"\x10" => 16,
"\x11" => 17,
"\x12" => 18,
"\x13" => 19,
"\x14" => 20,
"\x15" => 21,
"\x16" => 22,
"\x17" => 23,
"\x18" => 24,
"\x19" => 25,
"\x1A" => 26,
"\x1B" => 27,
"\x1C" => 28,
"\x1D" => 29,
"\x1E" => 30,
"\x1F" => 31,
"\x20" => 32,
"\x21" => 33,
"\x22" => 34,
"\x23" => 35,
"\x24" => 36,
"\x25" => 37,
"\x26" => 38,
"\x27" => 39,
"\x28" => 40,
"\x29" => 41,
"\x2A" => 42,
"\x2B" => 43,
"\x2C" => 44,
"\x2D" => 45,
"\x2E" => 46,
"\x2F" => 47,
"\x30" => 48,
"\x31" => 49,
"\x32" => 50,
"\x33" => 51,
"\x34" => 52,
"\x35" => 53,
"\x36" => 54,
"\x37" => 55,
"\x38" => 56,
"\x39" => 57,
"\x3A" => 58,
"\x3B" => 59,
"\x3C" => 60,
"\x3D" => 61,
"\x3E" => 62,
"\x3F" => 63,
"\x40" => 64,
"\x41" => 65,
"\x42" => 66,
"\x43" => 67,
"\x44" => 68,
"\x45" => 69,
"\x46" => 70,
"\x47" => 71,
"\x48" => 72,
"\x49" => 73,
"\x4A" => 74,
"\x4B" => 75,
"\x4C" => 76,
"\x4D" => 77,
"\x4E" => 78,
"\x4F" => 79,
"\x50" => 80,
"\x51" => 81,
"\x52" => 82,
"\x53" => 83,
"\x54" => 84,
"\x55" => 85,
"\x56" => 86,
"\x57" => 87,
"\x58" => 88,
"\x59" => 89,
"\x5A" => 90,
"\x5B" => 91,
"\x5C" => 92,
"\x5D" => 93,
"\x5E" => 94,
"\x5F" => 95,
"\x60" => 96,
"\x61" => 97,
"\x62" => 98,
"\x63" => 99,
"\x64" => 100,
"\x65" => 101,
"\x66" => 102,
"\x67" => 103,
"\x68" => 104,
"\x69" => 105,
"\x6A" => 106,
"\x6B" => 107,
"\x6C" => 108,
"\x6D" => 109,
"\x6E" => 110,
"\x6F" => 111,
"\x70" => 112,
"\x71" => 113,
"\x72" => 114,
"\x73" => 115,
"\x74" => 116,
"\x75" => 117,
"\x76" => 118,
"\x77" => 119,
"\x78" => 120,
"\x79" => 121,
"\x7A" => 122,
"\x7B" => 123,
"\x7C" => 124,
"\x7D" => 125,
"\x7E" => 126,
"\x7F" => 127,
"\x80" => 128,
"\x81" => 129,
"\x82" => 130,
"\x83" => 131,
"\x84" => 132,
"\x85" => 133,
"\x86" => 134,
"\x87" => 135,
"\x88" => 136,
"\x89" => 137,
"\x8A" => 138,
"\x8B" => 139,
"\x8C" => 140,
"\x8D" => 141,
"\x8E" => 142,
"\x8F" => 143,
"\x90" => 144,
"\x91" => 145,
"\x92" => 146,
"\x93" => 147,
"\x94" => 148,
"\x95" => 149,
"\x96" => 150,
"\x97" => 151,
"\x98" => 152,
"\x99" => 153,
"\x9A" => 154,
"\x9B" => 155,
"\x9C" => 156,
"\x9D" => 157,
"\x9E" => 158,
"\x9F" => 159,
"\xA0" => 160,
"\xA1" => 161,
"\xA2" => 162,
"\xA3" => 163,
"\xA4" => 164,
"\xA5" => 165,
"\xA6" => 166,
"\xA7" => 167,
"\xA8" => 168,
"\xA9" => 169,
"\xAA" => 170,
"\xAB" => 171,
"\xAC" => 172,
"\xAD" => 173,
"\xAE" => 174,
"\xAF" => 175,
"\xB0" => 176,
"\xB1" => 177,
"\xB2" => 178,
"\xB3" => 179,
"\xB4" => 180,
"\xB5" => 181,
"\xB6" => 182,
"\xB7" => 183,
"\xB8" => 184,
"\xB9" => 185,
"\xBA" => 186,
"\xBB" => 187,
"\xBC" => 188,
"\xBD" => 189,
"\xBE" => 190,
"\xBF" => 191,
"\xC0" => 192,
"\xC1" => 193,
"\xC2" => 194,
"\xC3" => 195,
"\xC4" => 196,
"\xC5" => 197,
"\xC6" => 198,
"\xC7" => 199,
"\xC8" => 200,
"\xC9" => 201,
"\xCA" => 202,
"\xCB" => 203,
"\xCC" => 204,
"\xCD" => 205,
"\xCE" => 206,
"\xCF" => 207,
"\xD0" => 208,
"\xD1" => 209,
"\xD2" => 210,
"\xD3" => 211,
"\xD4" => 212,
"\xD5" => 213,
"\xD6" => 214,
"\xD7" => 215,
"\xD8" => 216,
"\xD9" => 217,
"\xDA" => 218,
"\xDB" => 219,
"\xDC" => 220,
"\xDD" => 221,
"\xDE" => 222,
"\xDF" => 223,
"\xE0" => 224,
"\xE1" => 225,
"\xE2" => 226,
"\xE3" => 227,
"\xE4" => 228,
"\xE5" => 229,
"\xE6" => 230,
"\xE7" => 231,
"\xE8" => 232,
"\xE9" => 233,
"\xEA" => 234,
"\xEB" => 235,
"\xEC" => 236,
"\xED" => 237,
"\xEE" => 238,
"\xEF" => 239,
"\xF0" => 240,
"\xF1" => 241,
"\xF2" => 242,
"\xF3" => 243,
"\xF4" => 244,
"\xF5" => 245,
"\xF6" => 246,
"\xF7" => 247,
"\xF8" => 248,
"\xF9" => 249,
"\xFA" => 250,
"\xFB" => 251,
"\xFC" => 252,
"\xFD" => 253,
"\xFE" => 254,
"\xFF" => 255,
);
/**
* @var array
*/
private static $CHR = array(
0 => "\x00",
1 => "\x01",
2 => "\x02",
3 => "\x03",
4 => "\x04",
5 => "\x05",
6 => "\x06",
7 => "\x07",
8 => "\x08",
9 => "\x09",
10 => "\x0A",
11 => "\x0B",
12 => "\x0C",
13 => "\x0D",
14 => "\x0E",
15 => "\x0F",
16 => "\x10",
17 => "\x11",
18 => "\x12",
19 => "\x13",
20 => "\x14",
21 => "\x15",
22 => "\x16",
23 => "\x17",
24 => "\x18",
25 => "\x19",
26 => "\x1A",
27 => "\x1B",
28 => "\x1C",
29 => "\x1D",
30 => "\x1E",
31 => "\x1F",
32 => "\x20",
33 => "\x21",
34 => "\x22",
35 => "\x23",
36 => "\x24",
37 => "\x25",
38 => "\x26",
39 => "\x27",
40 => "\x28",
41 => "\x29",
42 => "\x2A",
43 => "\x2B",
44 => "\x2C",
45 => "\x2D",
46 => "\x2E",
47 => "\x2F",
48 => "\x30",
49 => "\x31",
50 => "\x32",
51 => "\x33",
52 => "\x34",
53 => "\x35",
54 => "\x36",
55 => "\x37",
56 => "\x38",
57 => "\x39",
58 => "\x3A",
59 => "\x3B",
60 => "\x3C",
61 => "\x3D",
62 => "\x3E",
63 => "\x3F",
64 => "\x40",
65 => "\x41",
66 => "\x42",
67 => "\x43",
68 => "\x44",
69 => "\x45",
70 => "\x46",
71 => "\x47",
72 => "\x48",
73 => "\x49",
74 => "\x4A",
75 => "\x4B",
76 => "\x4C",
77 => "\x4D",
78 => "\x4E",
79 => "\x4F",
80 => "\x50",
81 => "\x51",
82 => "\x52",
83 => "\x53",
84 => "\x54",
85 => "\x55",
86 => "\x56",
87 => "\x57",
88 => "\x58",
89 => "\x59",
90 => "\x5A",
91 => "\x5B",
92 => "\x5C",
93 => "\x5D",
94 => "\x5E",
95 => "\x5F",
96 => "\x60",
97 => "\x61",
98 => "\x62",
99 => "\x63",
100 => "\x64",
101 => "\x65",
102 => "\x66",
103 => "\x67",
104 => "\x68",
105 => "\x69",
106 => "\x6A",
107 => "\x6B",
108 => "\x6C",
109 => "\x6D",
110 => "\x6E",
111 => "\x6F",
112 => "\x70",
113 => "\x71",
114 => "\x72",
115 => "\x73",
116 => "\x74",
117 => "\x75",
118 => "\x76",
119 => "\x77",
120 => "\x78",
121 => "\x79",
122 => "\x7A",
123 => "\x7B",
124 => "\x7C",
125 => "\x7D",
126 => "\x7E",
127 => "\x7F",
128 => "\x80",
129 => "\x81",
130 => "\x82",
131 => "\x83",
132 => "\x84",
133 => "\x85",
134 => "\x86",
135 => "\x87",
136 => "\x88",
137 => "\x89",
138 => "\x8A",
139 => "\x8B",
140 => "\x8C",
141 => "\x8D",
142 => "\x8E",
143 => "\x8F",
144 => "\x90",
145 => "\x91",
146 => "\x92",
147 => "\x93",
148 => "\x94",
149 => "\x95",
150 => "\x96",
151 => "\x97",
152 => "\x98",
153 => "\x99",
154 => "\x9A",
155 => "\x9B",
156 => "\x9C",
157 => "\x9D",
158 => "\x9E",
159 => "\x9F",
160 => "\xA0",
161 => "\xA1",
162 => "\xA2",
163 => "\xA3",
164 => "\xA4",
165 => "\xA5",
166 => "\xA6",
167 => "\xA7",
168 => "\xA8",
169 => "\xA9",
170 => "\xAA",
171 => "\xAB",
172 => "\xAC",
173 => "\xAD",
174 => "\xAE",
175 => "\xAF",
176 => "\xB0",
177 => "\xB1",
178 => "\xB2",
179 => "\xB3",
180 => "\xB4",
181 => "\xB5",
182 => "\xB6",
183 => "\xB7",
184 => "\xB8",
185 => "\xB9",
186 => "\xBA",
187 => "\xBB",
188 => "\xBC",
189 => "\xBD",
190 => "\xBE",
191 => "\xBF",
192 => "\xC0",
193 => "\xC1",
194 => "\xC2",
195 => "\xC3",
196 => "\xC4",
197 => "\xC5",
198 => "\xC6",
199 => "\xC7",
200 => "\xC8",
201 => "\xC9",
202 => "\xCA",
203 => "\xCB",
204 => "\xCC",
205 => "\xCD",
206 => "\xCE",
207 => "\xCF",
208 => "\xD0",
209 => "\xD1",
210 => "\xD2",
211 => "\xD3",
212 => "\xD4",
213 => "\xD5",
214 => "\xD6",
215 => "\xD7",
216 => "\xD8",
217 => "\xD9",
218 => "\xDA",
219 => "\xDB",
220 => "\xDC",
221 => "\xDD",
222 => "\xDE",
223 => "\xDF",
224 => "\xE0",
225 => "\xE1",
226 => "\xE2",
227 => "\xE3",
228 => "\xE4",
229 => "\xE5",
230 => "\xE6",
231 => "\xE7",
232 => "\xE8",
233 => "\xE9",
234 => "\xEA",
235 => "\xEB",
236 => "\xEC",
237 => "\xED",
238 => "\xEE",
239 => "\xEF",
240 => "\xF0",
241 => "\xF1",
242 => "\xF2",
243 => "\xF3",
244 => "\xF4",
245 => "\xF5",
246 => "\xF6",
247 => "\xF7",
248 => "\xF8",
249 => "\xF9",
250 => "\xFA",
251 => "\xFB",
252 => "\xFC",
253 => "\xFD",
254 => "\xFE",
255 => "\xFF",
);
/**
* Maps a Windows-1252 byte value to the UTF-8 byte sequence of the character it represents.
*
* Only the byte values listed here are mapped; 0x81, 0x8d, 0x8f, 0x90, 0x9d and 0xff
* have no entry in this table.
*
* @var array
*/
private static $WIN1252_TO_UTF8 = array(
0x80 => "\xe2\x82\xac", # €
0x82 => "\xe2\x80\x9a", # ‚
0x83 => "\xc6\x92", # ƒ
0x84 => "\xe2\x80\x9e", # „
0x85 => "\xe2\x80\xa6", # …
0x86 => "\xe2\x80\xa0", # †
0x87 => "\xe2\x80\xa1", # ‡
0x88 => "\xcb\x86", # ˆ
0x89 => "\xe2\x80\xb0", # ‰
0x8a => "\xc5\xa0", # Š
0x8b => "\xe2\x80\xb9", # ‹
0x8c => "\xc5\x92", # Œ
0x8e => "\xc5\xbd", # Ž
0x91 => "\xe2\x80\x98", # ‘
0x92 => "\xe2\x80\x99", # ’
0x93 => "\xe2\x80\x9c", # “
0x94 => "\xe2\x80\x9d", # ”
0x95 => "\xe2\x80\xa2", # •
0x96 => "\xe2\x80\x93", # –
0x97 => "\xe2\x80\x94", # —
0x98 => "\xcb\x9c", # ˜
0x99 => "\xe2\x84\xa2", # ™
0x9a => "\xc5\xa1", # š
0x9b => "\xe2\x80\xba", # ›
0x9c => "\xc5\x93", # œ
0x9e => "\xc5\xbe", # ž
0x9f => "\xc5\xb8", # Ÿ
0xa0 => "\xc2\xa0", # (no-break space)
0xa1 => "\xc2\xa1", # ¡
0xa2 => "\xc2\xa2", # ¢
0xa3 => "\xc2\xa3", # £
0xa4 => "\xc2\xa4", # ¤
0xa5 => "\xc2\xa5", # ¥
0xa6 => "\xc2\xa6", # ¦
0xa7 => "\xc2\xa7", # §
0xa8 => "\xc2\xa8", # ¨
0xa9 => "\xc2\xa9", # ©
0xaa => "\xc2\xaa", # ª
0xab => "\xc2\xab", # «
0xac => "\xc2\xac", # ¬
0xad => "\xc2\xad", # (soft hyphen)
0xae => "\xc2\xae", # ®
0xaf => "\xc2\xaf", # ¯
0xb0 => "\xc2\xb0", # °
0xb1 => "\xc2\xb1", # ±
0xb2 => "\xc2\xb2", # ²
0xb3 => "\xc2\xb3", # ³
0xb4 => "\xc2\xb4", # ´
0xb5 => "\xc2\xb5", # µ
0xb6 => "\xc2\xb6", # ¶
0xb7 => "\xc2\xb7", # ·
0xb8 => "\xc2\xb8", # ¸
0xb9 => "\xc2\xb9", # ¹
0xba => "\xc2\xba", # º
0xbb => "\xc2\xbb", # »
0xbc => "\xc2\xbc", # ¼
0xbd => "\xc2\xbd", # ½
0xbe => "\xc2\xbe", # ¾
0xbf => "\xc2\xbf", # ¿
0xc0 => "\xc3\x80", # À
0xc1 => "\xc3\x81", # Á
0xc2 => "\xc3\x82", # Â
0xc3 => "\xc3\x83", # Ã
0xc4 => "\xc3\x84", # Ä
0xc5 => "\xc3\x85", # Å
0xc6 => "\xc3\x86", # Æ
0xc7 => "\xc3\x87", # Ç
0xc8 => "\xc3\x88", # È
0xc9 => "\xc3\x89", # É
0xca => "\xc3\x8a", # Ê
0xcb => "\xc3\x8b", # Ë
0xcc => "\xc3\x8c", # Ì
0xcd => "\xc3\x8d", # Í
0xce => "\xc3\x8e", # Î
0xcf => "\xc3\x8f", # Ï
0xd0 => "\xc3\x90", # Ð
0xd1 => "\xc3\x91", # Ñ
0xd2 => "\xc3\x92", # Ò
0xd3 => "\xc3\x93", # Ó
0xd4 => "\xc3\x94", # Ô
0xd5 => "\xc3\x95", # Õ
0xd6 => "\xc3\x96", # Ö
0xd7 => "\xc3\x97", # ×
0xd8 => "\xc3\x98", # Ø
0xd9 => "\xc3\x99", # Ù
0xda => "\xc3\x9a", # Ú
0xdb => "\xc3\x9b", # Û
0xdc => "\xc3\x9c", # Ü
0xdd => "\xc3\x9d", # Ý
0xde => "\xc3\x9e", # Þ
0xdf => "\xc3\x9f", # ß
0xe0 => "\xc3\xa0", # à
0xe1 => "\xc3\xa1", # á (was "\xa1": a lone continuation byte, i.e. invalid UTF-8)
0xe2 => "\xc3\xa2", # â
0xe3 => "\xc3\xa3", # ã
0xe4 => "\xc3\xa4", # ä
0xe5 => "\xc3\xa5", # å
0xe6 => "\xc3\xa6", # æ
0xe7 => "\xc3\xa7", # ç
0xe8 => "\xc3\xa8", # è
0xe9 => "\xc3\xa9", # é
0xea => "\xc3\xaa", # ê
0xeb => "\xc3\xab", # ë
0xec => "\xc3\xac", # ì
0xed => "\xc3\xad", # í
0xee => "\xc3\xae", # î
0xef => "\xc3\xaf", # ï
0xf0 => "\xc3\xb0", # ð
0xf1 => "\xc3\xb1", # ñ
0xf2 => "\xc3\xb2", # ò
0xf3 => "\xc3\xb3", # ó
0xf4 => "\xc3\xb4", # ô
0xf5 => "\xc3\xb5", # õ
0xf6 => "\xc3\xb6", # ö
0xf7 => "\xc3\xb7", # ÷
0xf8 => "\xc3\xb8", # ø
0xf9 => "\xc3\xb9", # ù
0xfa => "\xc3\xba", # ú
0xfb => "\xc3\xbb", # û
0xfc => "\xc3\xbc", # ü
0xfd => "\xc3\xbd", # ý
0xfe => "\xc3\xbe", # þ
);
/**
* @var array
*/
private static $UTF8_MSWORD = array(
"\xc2\xab" => '"', // « (U+00AB) in UTF-8
"\xc2\xbb" => '"', // » (U+00BB) in UTF-8
"\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
"\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
"\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
"\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
"\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
"\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
"\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
"\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
"\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
"\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
"\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
"\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
"\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
);
/**
* Known byte order marks: BOM bytes => length of that BOM in bytes.
*
* Every BOM is listed twice: once as its raw bytes, and once as the bytes that result
* when the raw BOM was misread as "WINDOWS-1252" and then encoded as UTF-8 (each
* non-ASCII byte becomes a two-byte sequence, so the length grows).
*
* All keys are written as escape sequences so that they do not depend on the encoding
* of this source file, and so that each key's byte count matches its declared length.
*
* INFO: https://en.wikipedia.org/wiki/Byte_order_mark
*
* @var array
*/
private static $BOM = array(
"\xef\xbb\xbf" => 3, // UTF-8 BOM
"\xc3\xaf\xc2\xbb\xc2\xbf" => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more than one byte ...)
"\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM
"\x00\x00\xc3\xbe\xc3\xbf" => 6, // UTF-32 (BE) BOM as "WINDOWS-1252"
"\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM
"\xc3\xbf\xc3\xbe\x00\x00" => 6, // UTF-32 (LE) BOM as "WINDOWS-1252"
"\xfe\xff" => 2, // UTF-16 (BE) BOM
"\xc3\xbe\xc3\xbf" => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
"\xff\xfe" => 2, // UTF-16 (LE) BOM
"\xc3\xbf\xc3\xbe" => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
);
/**
* @var array
*/
private static $WHITESPACE_TABLE = array(
'SPACE' => "\x20",
'NO-BREAK SPACE' => "\xc2\xa0",
'OGHAM SPACE MARK' => "\xe1\x9a\x80",
'EN QUAD' => "\xe2\x80\x80",
'EM QUAD' => "\xe2\x80\x81",
'EN SPACE' => "\xe2\x80\x82",
'EM SPACE' => "\xe2\x80\x83",
'THREE-PER-EM SPACE' => "\xe2\x80\x84",
'FOUR-PER-EM SPACE' => "\xe2\x80\x85",
'SIX-PER-EM SPACE' => "\xe2\x80\x86",
'FIGURE SPACE' => "\xe2\x80\x87",
'PUNCTUATION SPACE' => "\xe2\x80\x88",
'THIN SPACE' => "\xe2\x80\x89",
'HAIR SPACE' => "\xe2\x80\x8a",
'LINE SEPARATOR' => "\xe2\x80\xa8",
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9",
'ZERO WIDTH SPACE' => "\xe2\x80\x8b",
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf",
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80",
);
private static $BROKEN_UTF8_FIX = array(
"\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
"\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
"\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
"\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
"\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
"\xc2\x86" => "\xe2\x80\xa0", // DAGGER
"\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
"\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
"\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
"\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
"\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
"\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
"\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
"\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
"\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
"\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
"\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
"\xc2\x95" => "\xe2\x80\xa2", // BULLET
"\xc2\x96" => "\xe2\x80\x93", // EN DASH
"\xc2\x97" => "\xe2\x80\x94", // EM DASH
"\xc2\x98" => "\xcb\x9c", // SMALL TILDE
"\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
"\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
"\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
"\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
"\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
"\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
'ü' => 'ü',
'ä' => 'ä',
'ö' => 'ö',
'Ö' => 'Ö',
'ß' => 'ß',
'Ã ' => 'à',
'á' => 'á',
'â' => 'â',
'ã' => 'ã',
'ù' => 'ù',
'ú' => 'ú',
'û' => 'û',
'Ù' => 'Ù',
'Ú' => 'Ú',
'Û' => 'Û',
'Ãœ' => 'Ü',
'ò' => 'ò',
'ó' => 'ó',
'ô' => 'ô',
'è' => 'è',
'é' => 'é',
'ê' => 'ê',
'ë' => 'ë',
'À' => 'À',
'Á' => 'Á',
'Â' => 'Â',
'Ã' => 'Ã',
'Ä' => 'Ä',
'Ã…' => 'Å',
'Ç' => 'Ç',
'È' => 'È',
'É' => 'É',
'Ê' => 'Ê',
'Ë' => 'Ë',
'ÃŒ' => 'Ì',
'Í' => 'Í',
'ÃŽ' => 'Î',
'Ï' => 'Ï',
'Ñ' => 'Ñ',
'Ã’' => 'Ò',
'Ó' => 'Ó',
'Ô' => 'Ô',
'Õ' => 'Õ',
'Ø' => 'Ø',
'Ã¥' => 'å',
'æ' => 'æ',
'ç' => 'ç',
'ì' => 'ì',
'í' => 'í',
'î' => 'î',
'ï' => 'ï',
'ð' => 'ð',
'ñ' => 'ñ',
'õ' => 'õ',
'ø' => 'ø',
'ý' => 'ý',
'ÿ' => 'ÿ',
'€' => '€',
'’' => '’',
);
/**
* bidirectional text chars
*
* url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
*
* @var array
*/
private static $BIDI_UNI_CODE_CONTROLS_TABLE = array(
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
8234 => "\xE2\x80\xAA",
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
8235 => "\xE2\x80\xAB",
// POP DIRECTIONAL FORMATTING // (use -> </bdo>)
8236 => "\xE2\x80\xAC",
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
8237 => "\xE2\x80\xAD",
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
8238 => "\xE2\x80\xAE",
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
8294 => "\xE2\x81\xA6",
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
8295 => "\xE2\x81\xA7",
// FIRST STRONG ISOLATE // (use -> dir = "auto")
8296 => "\xE2\x81\xA8",
// POP DIRECTIONAL ISOLATE
8297 => "\xE2\x81\xA9",
);
/**
* Probes the PHP environment for the optional features this class can use
* (mbstring, mbstring function overloading, iconv, intl, IntlChar, PCRE UTF-8)
* and stores the results in self::$SUPPORT.
*
* Only the first call does any work; later calls return immediately because the
* "already_checked_via_portable_utf8" flag is set.
*
* INFO: You don't need to run it manually, it will be triggered if it's needed.
*/
public static function checkForSupport() {
    if (isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }

    self::$SUPPORT['already_checked_via_portable_utf8'] = true;

    // http://php.net/manual/en/book.mbstring.php
    self::$SUPPORT['mbstring'] = self::mbstring_loaded();
    self::$SUPPORT['mbstring_func_overload'] = self::mbstring_overloaded();

    // http://php.net/manual/en/book.iconv.php
    self::$SUPPORT['iconv'] = self::iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $intlAvailable = self::intl_loaded();
    self::$SUPPORT['intl'] = $intlAvailable;

    // The transliterator id list stays empty unless intl provides the function.
    $canListTransliterators = $intlAvailable === true
        && \function_exists('transliterator_list_ids') === true;
    self::$SUPPORT['intl__transliterator_list_ids'] = $canListTransliterators
        ? transliterator_list_ids()
        : array();

    // http://php.net/manual/en/class.intlchar.php
    self::$SUPPORT['intlChar'] = self::intlChar_loaded();

    // http://php.net/manual/en/book.pcre.php
    self::$SUPPORT['pcre_utf8'] = self::pcre_utf8_support();
}
/**
* Returns, for every character of a Unicode string, how many bytes it occupies.
*
* 1 byte => U+0000 - U+007F
* 2 byte => U+0080 - U+07FF
* 3 byte => U+0800 - U+FFFF
* 4 byte => U+10000 - U+10FFFF
*
* @param string $str <p>The original unicode string.</p>
*
* @return array <p>Byte length per character, keyed like the result of self::split();
*               an empty array for an empty string.</p>
*/
public static function chr_size_list($str) {
    if (!isset($str[0])) {
        return array();
    }

    $byteLengths = array();
    foreach (self::split($str) as $position => $character) {
        // strlen() is byte-based, which is exactly what we want here
        $byteLengths[$position] = \strlen($character);
    }

    return $byteLengths;
}
/**
* Strips every byte that is not part of a UTF-8 shaped sequence from a string and
* optionally applies further clean-up steps.
*
* The steps run in this fixed order: drop invalid bytes, replace diamond question
* marks, remove invisible characters, normalize whitespace, normalize MS Word
* characters, remove the BOM.
*
* @param string $str <p>The string to be sanitized.</p>
* @param bool $remove_bom [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
* @param bool $normalize_whitespace [optional] <p>Set to true, if you need to normalize the whitespace.</p>
* @param bool $normalize_msword [optional] <p>Set to true, if you need to normalize MS Word chars
*                               e.g.: "…" => "..."</p>
* @param bool $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in
*                                      combination with $normalize_whitespace</p>
* @param bool $replace_diamond_question_mark [optional] <p>Set to true, if you need to remove diamond
*                                            question mark e.g.: "�"</p>
* @param bool $remove_invisible_characters [optional] <p>Set to false, if you do not want to remove
*                                          invisible characters e.g.: "\0"</p>
*
* @return string <p>Clean UTF-8 encoded string.</p>
*/
public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings
    //
    // Group 1 captures runs of well-formed sequences; the other two alternatives
    // match a single stray byte. Replacing with '$1' keeps only group 1.
    $utf8Pattern = '/
(
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
){1,100} # ...one or more times
)
| ( [\x80-\xBF] ) # invalid byte in range 10000000 - 10111111
| ( [\xC0-\xFF] ) # invalid byte in range 11000000 - 11111111
/x';
    $cleaned = (string)\preg_replace($utf8Pattern, '$1', $str);

    $cleaned = $replace_diamond_question_mark === true
        ? self::replace_diamond_question_mark($cleaned, '')
        : $cleaned;

    $cleaned = $remove_invisible_characters === true
        ? self::remove_invisible_characters($cleaned)
        : $cleaned;

    $cleaned = $normalize_whitespace === true
        ? self::normalize_whitespace($cleaned, $keep_non_breaking_space)
        : $cleaned;

    $cleaned = $normalize_msword === true
        ? self::normalize_msword($cleaned)
        : $cleaned;

    $cleaned = $remove_bom === true
        ? self::remove_bom($cleaned)
        : $cleaned;

    return $cleaned;
}
/**
* Counts how often each character occurs in a string.
*
* @param string $str <p>The input string.</p>
* @param bool $cleanUtf8 [optional] <p>Remove non UTF-8 chars from the string.</p>
*
* @return array <p>An associative array with the characters as keys and
*               their number of occurrences as values.</p>
*/
public static function count_chars($str, $cleanUtf8 = false) {
    // one array element per character
    $characters = self::split($str, 1, $cleanUtf8);

    return \array_count_values($characters);
}
/**
* Encode a string with a new charset-encoding.
*
* INFO: The difference to "StringLib::utf8_encode()" is that this function also tries to fix
* broken / double encoding, so you can call it on a string that is already UTF-8 without
* messing the string up.
*
* The input is returned unchanged when it or $encoding is empty, when the current encoding
* cannot be detected, when $force is false and the string already has the requested encoding,
* or when a generic conversion is needed but mb_convert_encoding() is not available.
*
* @param string $encoding <p>e.g. 'UTF-16', 'UTF-8', 'ISO-8859-1', etc.</p>
* @param string $str <p>The input string</p>
* @param bool $force [optional] <p>Force the new encoding (we try to fix broken / double encoding for
*                    UTF-8)<br> otherwise we auto-detect the current string-encoding</p>
*
* @return string
*/
public static function encode($encoding, $str, $force = true) {
    if (!isset($str[0], $encoding[0])) {
        return $str;
    }

    if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding, 'UTF-8');
    }

    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }

    $encodingDetected = self::str_detect_encoding($str);
    if ($encodingDetected === false) {
        return $str;
    }

    if ($force !== true && $encodingDetected === $encoding) {
        return $str;
    }

    // UTF-8 and Latin-1 targets are handled by our own converters, as long as the
    // source is one of the encodings they understand (or the caller forces it).
    $useOwnConverter = $force === true
        || \in_array($encodingDetected, array('UTF-8', 'WINDOWS-1252', 'ISO-8859-1'), true);

    if ($useOwnConverter && $encoding === 'UTF-8') {
        return self::to_utf8($str);
    }

    if ($useOwnConverter && $encoding === 'ISO-8859-1') {
        return self::to_iso8859($str);
    }

    if (
        $encoding !== 'UTF-8'
        &&
        $encoding !== 'WINDOWS-1252'
        &&
        self::$SUPPORT['mbstring'] === false
    ) {
        \trigger_error('StringLib::encode() without mbstring cannot handle "' . $encoding . '" encoding', E_USER_WARNING);
    }

    // Without mbstring (or a polyfill) there is no mb_convert_encoding(); calling it
    // would be a fatal error, so hand the string back untouched instead.
    if (\function_exists('mb_convert_encoding') === false) {
        return $str;
    }

    $strEncoded = \mb_convert_encoding(
        $str,
        $encoding,
        $encodingDetected
    );

    if ($strEncoded) {
        return $strEncoded;
    }

    return $str;
}
/**
* Converts a string, or every element of a (possibly nested) array of strings,
* into "ISO-8859"-encoding (Latin-1).
*
* Array keys are preserved. Anything that is not an array is cast to string first;
* an empty string yields an empty string.
*
* @param string|string[] $str
*
* @return string|string[]
*/
public static function to_iso8859($str)
{
    if (\is_array($str) === true) {
        // recurse into every element; array_map() keeps the keys of a single array
        return \array_map(array(__CLASS__, __FUNCTION__), $str);
    }

    $text = (string)$str;

    return isset($text[0]) ? self::utf8_decode($text) : '';
}
/**
* Converts a UTF-8 string to a series of HTML numbered entities.
*
* INFO: opposite to StringLib::html_decode()
*
* When mb_encode_numericentity() exists it does the work; otherwise every character
* is encoded individually via StringLib::single_chr_html_encode().
*
* @param string $str <p>The Unicode string to be encoded as numbered entities.</p>
* @param bool $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
* @param string $encoding [optional] <p>Default is UTF-8</p>
*
* @return string <p>HTML numbered entities; an empty string for empty input.</p>
*/
public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8') {
    if (!isset($str[0])) {
        return '';
    }

    if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding, 'UTF-8');
    }

    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (\function_exists('mb_encode_numericentity')) {
        // Start above the ASCII range if ASCII chars have to stay as they are.
        $startCode = $keepAsciiChars === true ? 0x80 : 0x00;

        // A convmap consists of groups of exactly four integers:
        // start code, end code, offset, mask. (A fifth element makes PHP 8 throw a ValueError.)
        // The range ends at U+10FFFF so that the whole Unicode range is covered.
        return \mb_encode_numericentity(
            $str,
            array($startCode, 0x10ffff, 0, 0x1fffff),
            $encoding
        );
    }

    return \implode(
        '',
        \array_map(
            function ($data) use ($keepAsciiChars, $encoding) {
                return StringLib::single_chr_html_encode($data, $keepAsciiChars, $encoding);
            },
            self::split($str)
        )
    );
}
/**
* UTF-8 version of html_entity_decode()
*
* The reason we are not using html_entity_decode() by itself is because
* while it is not technically correct to leave out the semicolon
* at the end of an entity most browsers will still interpret the entity
* correctly. html_entity_decode() does not convert entities without
* semicolons, so we are left with our own little solution here. Bummer.
*
* Converts all HTML entities to their applicable characters. Decoding is repeated
* until the string no longer changes, so nested entities (e.g. "&amp;lt;") are
* decoded completely.
*
* INFO: opposite to StringLib::html_encode()
*
* @link http://php.net/manual/en/function.html-entity-decode.php
*
* @param string $str <p>The input string.</p>
* @param int $flags [optional] <p>
*                   A bitmask of one or more of the following flags, which specify how to handle
*                   quotes and which document type to use. If null is passed (the default),
*                   ENT_QUOTES | ENT_HTML5 is used.
*                   <ul>
*                   <li><b>ENT_COMPAT</b>: Will convert double-quotes and leave single-quotes alone.</li>
*                   <li><b>ENT_QUOTES</b>: Will convert both double and single quotes.</li>
*                   <li><b>ENT_NOQUOTES</b>: Will leave both double and single quotes unconverted.</li>
*                   <li><b>ENT_HTML401</b>: Handle code as HTML 4.01.</li>
*                   <li><b>ENT_XML1</b>: Handle code as XML 1.</li>
*                   <li><b>ENT_XHTML</b>: Handle code as XHTML.</li>
*                   <li><b>ENT_HTML5</b>: Handle code as HTML 5.</li>
*                   </ul>
*                   </p>
* @param string $encoding [optional] <p>Encoding to use.</p>
*
* @return string <p>The decoded string.</p>
*/
public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8') {
    if (!isset($str[0])) {
        return '';
    }

    if (!isset($str[3])) { // examples: &; || &x;
        return $str;
    }

    // Nothing that could be an entity: no "&" at all, or neither "&#" nor ";".
    if (
        \strpos($str, '&') === false
        ||
        (
            \strpos($str, '&#') === false
            &&
            \strpos($str, ';') === false
        )
    ) {
        return $str;
    }

    if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = ENT_QUOTES | ENT_HTML5;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'WINDOWS-1252') {
        // self::$SUPPORT is only filled by checkForSupport(); make sure it ran
        // before reading from it.
        if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
            self::checkForSupport();
        }

        if (self::$SUPPORT['mbstring'] === false) {
            \trigger_error('StringLib::html_entity_decode() without mbstring cannot handle "' . $encoding . '" encoding', E_USER_WARNING);
        }
    }

    // Without mb_convert_encoding() the decimal entities are left for the native
    // html_entity_decode() call below instead of causing a fatal error.
    $canConvertViaMbstring = \function_exists('mb_convert_encoding');

    do {
        $str_compare = $str;

        if ($canConvertViaMbstring) {
            $str = (string)\preg_replace_callback(
                "/&#\d{2,6};/",
                function ($matches) use ($encoding) {
                    $returnTmp = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');

                    // quotes stay encoded here; whether they get decoded is up to $flags below
                    if ($returnTmp !== '"' && $returnTmp !== "'") {
                        return $returnTmp;
                    }

                    return $matches[0];
                },
                $str
            );
        }

        // decode numeric & UTF16 two byte entities
        // (the inner preg_replace() appends the missing ";" to numeric entities first)
        $str = \html_entity_decode(
            \preg_replace('/(&#(?:x0*[0-9a-f]{2,6}(?![0-9a-f;])|(?:0*\d{2,6}(?![0-9;]))))/iS', '$1;', $str),
            $flags,
            $encoding
        );
    } while ($str_compare !== $str);

    return $str;
}
/**
 * Reports whether the "iconv" extension is loaded.
 *
 * @return bool <p><strong>true</strong> if the extension is loaded, <strong>false</strong> otherwise.</p>
 */
public static function iconv_loaded() {
    $isLoaded = \extension_loaded('iconv');

    return $isLoaded;
}
/**
 * Reports whether the "IntlChar" class exists.
 *
 * @return bool <p><strong>true</strong> if the class is available, <strong>false</strong> otherwise.</p>
 */
public static function intlChar_loaded() {
    $hasIntlChar = \class_exists('IntlChar');

    return $hasIntlChar;
}
/**
 * Reports whether the "intl" extension is loaded.
 *
 * @return bool <p><strong>true</strong> if the extension is loaded, <strong>false</strong> otherwise.</p>
 */
public static function intl_loaded() {
    $isLoaded = \extension_loaded('intl');

    return $isLoaded;
}
/**
 * Checks if a string is 7 bit ASCII.
 *
 * Accepted bytes are the printable range 0x20-0x7E plus the few control bytes listed in the
 * pattern below; any other byte makes the check fail.
 *
 * @param string $str <p>The string to check.</p>
 *
 * @return bool <p>
 *              <strong>true</strong> if it is ASCII (an empty input counts as ASCII)<br>
 *              <strong>false</strong> otherwise
 *              </p>
 */
public static function is_ascii($str) {
    // an empty input trivially passes
    if (!isset($str[0])) {
        return true;
    }
    // NOTE(review): besides tab (\x09), LF (\x0A) and CR (\x0D) the class also lets \x10 and
    // \x13 through - they look like decimal 10 / 13 written as hex by mistake; confirm before
    // tightening, since that would change results for existing callers.
    $foundForeignByte = \preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);

    return !$foundForeignByte;
}
/**
 * Heuristic check whether the input looks like binary data.
 *
 * The input is cast to string and reported as binary when
 * - it consists solely of the characters "0" and "1", or
 * - more than 30% of its bytes are NUL bytes, or
 * - in strict mode with the "finfo" class available: finfo reports the encoding "binary", or
 * - otherwise (non-strict mode, or finfo missing): it contains at least one NUL byte.
 *
 * @param mixed $input
 * @param bool  $strict <p>Set to true to use finfo (when available) instead of the plain NUL-byte test.</p>
 *
 * @return bool
 */
public static function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if (!isset($input[0])) {
        return false;
    }
    // a pure run of zeros and ones is treated as a bit-string
    if (\preg_match('~^[01]+$~', $input)) {
        return true;
    }
    $byteLength = \strlen($input);
    $nullByteCount = $byteLength ? \substr_count($input, "\x0") : 0;
    if ($byteLength && ($nullByteCount / $byteLength) > 0.3) {
        return true;
    }
    if ($strict === true && \class_exists('finfo')) {
        // in strict mode the decision is delegated to the fileinfo database
        $detector = new \finfo(FILEINFO_MIME_ENCODING);

        return $detector->buffer($input) === 'binary';
    }

    return $nullByteCount > 0;
}
/**
 * Reads an entire file into a string and (optionally) converts the content to clean UTF-8.
 *
 * WARNING: do not use the UTF-8 option ($convertToUtf8) for binary files (e.g. images) !!!
 *
 * @link http://php.net/manual/en/function.file-get-contents.php
 *
 * @param string        $filename         <p>Name of the file to read.</p>
 * @param bool          $use_include_path [optional] <p>Passed through to the native function to
 *                                        trigger the include-path search.</p>
 * @param resource|null $context          [optional] <p>
 *                                        A stream context created with stream_context_create().
 *                                        When null and $timeout is non-empty, a context carrying the
 *                                        HTTP timeout is created.
 *                                        </p>
 * @param int|null      $offset           [optional] <p>The offset where the reading starts (null means 0).</p>
 * @param int|null      $maxLength        [optional] <p>
 *                                        Maximum length of data read; only used when it is an integer,
 *                                        otherwise the file is read until its end.
 *                                        </p>
 * @param int           $timeout          <p>The time in seconds for the (HTTP) timeout.</p>
 * @param bool          $convertToUtf8    <strong>WARNING!!!</strong> <p>Do not use this option for e.g.
 *                                        images or PDFs, their bytes are not UTF-8 text.</p>
 *
 * @return string|false <p>The function returns the read data or false on failure.</p>
 */
public static function file_get_contents($filename, $use_include_path = false, $context = null, $offset = null, $maxLength = null, $timeout = 10, $convertToUtf8 = true)
{
    // NOTE(review): FILTER_SANITIZE_STRING strips tags / encodes quotes (so such file names are
    // altered) and is deprecated as of PHP 8.1 - kept as-is here, consider replacing it.
    $filename = \filter_var($filename, FILTER_SANITIZE_STRING);

    // without a caller-supplied context, build one that only carries the HTTP timeout
    if ($timeout && $context === null) {
        $context = \stream_context_create(array('http' => array('timeout' => $timeout)));
    }
    if ($offset === null) {
        $offset = 0;
    }

    // the length argument is forwarded only when it is a real integer
    $data = \is_int($maxLength)
        ? \file_get_contents($filename, $use_include_path, $context, $offset, $maxLength)
        : \file_get_contents($filename, $use_include_path, $context, $offset);

    // reading failed
    if ($data === false) {
        return false;
    }
    if ($convertToUtf8 !== true) {
        return $data;
    }

    // binary payloads that are neither UTF-16 nor UTF-32 are returned untouched
    $isOpaqueBinary = self::is_binary($data, true) === true
        && self::is_utf16($data) === false
        && self::is_utf32($data) === false;
    if ($isOpaqueBinary) {
        return $data;
    }

    return self::cleanup(self::encode('UTF-8', $data, false));
}
/**
 * Cleans up a string: repairs simple encoding mix-ups first, then runs it through clean().
 *
 * @param string $str <p>The input string.</p>
 *
 * @return string <p>The cleaned string, or an empty string for empty input.</p>
 */
public static function cleanup($str)
{
    if (!isset($str[0])) {
        return '';
    }

    // step 1: fix ISO <-> UTF-8 mix-ups
    $repaired = self::fix_simple_utf8($str);

    // step 2: per the original notes this call is meant to remove non UTF-8 symbols, the
    // diamond question mark, invisible characters (e.g. "\0") and the BOM, and to normalize
    // whitespace while keeping non-breaking spaces.
    // NOTE(review): the mapping of these flags to clean()'s parameters is not visible here - confirm.
    return self::clean($repaired, true, true, false, true, true, true);
}
/**
 * Try to fix simple broken UTF-8 strings.
 *
 * Every key of the $BROKEN_UTF8_FIX table found in the input is replaced by the table's value.
 *
 * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
 * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
 * See: http://en.wikipedia.org/wiki/Windows-1252
 *
 * @param string $str <p>The input string</p>
 *
 * @return string <p>The repaired string, or an empty string for empty input.</p>
 */
public static function fix_simple_utf8($str)
{
    // search / replace lists are derived from the table once and then reused
    static $brokenSequences = null;
    static $fixedSequences = null;

    if (!isset($str[0])) {
        return '';
    }
    if ($brokenSequences === null) {
        $brokenSequences = \array_keys(self::$BROKEN_UTF8_FIX);
        $fixedSequences = \array_values(self::$BROKEN_UTF8_FIX);
    }

    return \str_replace($brokenSequences, $fixedSequences, $str);
}
/**
 * Check if the string is UTF-16.
 *
 * Only inputs that is_binary() accepts are examined. For each byte order the input is
 * converted to UTF-8 and round-tripped; a byte order scores one point per character of the
 * round-trip result that is found in the character table of the BOM-stripped input.
 * The byte order with the higher score wins, a tie means "not UTF-16".
 *
 * @param string $str <p>The input string.</p>
 *
 * @return int|false <p>
 *                   <strong>false</strong> if it is not UTF-16,<br>
 *                   <strong>1</strong> for UTF-16LE,<br>
 *                   <strong>2</strong> for UTF-16BE.
 *                   </p>
 */
public static function is_utf16($str)
{
    if (self::is_binary($str) === false) {
        return false;
    }

    $str = self::remove_bom($str);
    $inputChars = array(); // filled lazily, shared by both passes
    $score = array('UTF-16LE' => 0, 'UTF-16BE' => 0);

    foreach (array('UTF-16LE', 'UTF-16BE') as $candidate) {
        $decoded = \mb_convert_encoding($str, 'UTF-8', $candidate);
        if (!$decoded) {
            continue;
        }
        // the decoded text must survive a round trip through the candidate encoding
        $reEncoded = \mb_convert_encoding($decoded, $candidate, 'UTF-8');
        $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $candidate);
        if ($roundTrip !== $decoded) {
            continue;
        }
        if (\count($inputChars) === 0) {
            $inputChars = self::count_chars($str, true);
        }
        // NOTE(review): the needle is a key of the count_chars() result while in_array()
        // searches the values of $inputChars - confirm this is intended.
        foreach (self::count_chars($roundTrip, true) as $char => $unused) {
            if (\in_array($char, $inputChars, true) === true) {
                $score[$candidate]++;
            }
        }
    }

    if ($score['UTF-16LE'] === $score['UTF-16BE']) {
        return false;
    }

    return $score['UTF-16LE'] > $score['UTF-16BE'] ? 1 : 2;
}
/**
 * Check if the string is UTF-32.
 *
 * Only inputs that is_binary() accepts are examined. For each byte order the input is
 * converted to UTF-8 and round-tripped; a byte order scores one point per character of the
 * round-trip result that is found in the character table of the BOM-stripped input.
 * The byte order with the higher score wins, a tie means "not UTF-32".
 *
 * @param string $str <p>The input string.</p>
 *
 * @return int|false <p>
 *                   <strong>false</strong> if it is not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE.
 *                   </p>
 */
public static function is_utf32($str)
{
    if (self::is_binary($str) === false) {
        return false;
    }

    $str = self::remove_bom($str);
    $inputChars = array(); // filled lazily, shared by both passes
    $score = array('UTF-32LE' => 0, 'UTF-32BE' => 0);

    foreach (array('UTF-32LE', 'UTF-32BE') as $candidate) {
        $decoded = \mb_convert_encoding($str, 'UTF-8', $candidate);
        if (!$decoded) {
            continue;
        }
        // the decoded text must survive a round trip through the candidate encoding
        $reEncoded = \mb_convert_encoding($decoded, $candidate, 'UTF-8');
        $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $candidate);
        if ($roundTrip !== $decoded) {
            continue;
        }
        if (\count($inputChars) === 0) {
            $inputChars = self::count_chars($str, true);
        }
        // NOTE(review): the needle is a key of the count_chars() result while in_array()
        // searches the values of $inputChars - confirm this is intended.
        foreach (self::count_chars($roundTrip, true) as $char => $unused) {
            if (\in_array($char, $inputChars, true) === true) {
                $score[$candidate]++;
            }
        }
    }

    if ($score['UTF-32LE'] === $score['UTF-32BE']) {
        return false;
    }

    return $score['UTF-32LE'] > $score['UTF-32BE'] ? 1 : 2;
}
/**
 * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
 *
 * Arrays are checked element by element; an empty string is considered valid.
 *
 * @see http://hsivonen.iki.fi/php-utf8/
 *
 * @param string|string[] $str    <p>The string (or array of strings) to be checked.</p>
 * @param bool            $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
 *
 * @return bool
 */
public static function is_utf8($str, $strict = false) {
    if (\is_array($str) === true) {
        foreach ($str as $v) {
            if (false === self::is_utf8($v, $strict)) {
                return false;
            }
        }
        return true;
    }
    if (!isset($str[0])) {
        return true;
    }
    if ($strict === true) {
        if (self::is_utf16($str) !== false) {
            return false;
        }
        if (self::is_utf32($str) !== false) {
            return false;
        }
    }
    // Fast path - only usable when PCRE was built WITH UTF-8 support (the original condition
    // was inverted and took this branch exactly when the /u modifier cannot work).
    // If even just the first character can be matched when the /u modifier is used, then the
    // whole subject is valid UTF-8: for invalid UTF-8 nothing at all will match, even if the
    // string contains some valid sequences.
    if (self::pcre_utf8_support() === true) {
        return (\preg_match('/^.{1}/us', $str) === 1);
    }
    // Fallback: byte-wise state machine.
    $mState = 0; // expected number of octets after the current octet
                 // until the beginning of the next UTF-8 character sequence
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // expected number of octets in the current sequence
    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }
    $len = \strlen($str); // count the bytes
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
        $in = self::$ORD[$str[$i]];
        if ($mState === 0) {
            // When mState is zero we expect either a US-ASCII character or the
            // first octet of a multi-octet sequence.
            if (0 === (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            } elseif (0xC0 === (0xE0 & $in)) {
                // First octet of 2 octet sequence.
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 === (0xF0 & $in)) {
                // First octet of 3 octet sequence.
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 === (0xF8 & $in)) {
                // First octet of 4 octet sequence.
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 === (0xFC & $in)) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded code point must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC === (0xFE & $in)) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return false;
            }
        } else {
            // When mState is non-zero, we expect a continuation of the multi-octet sequence.
            if (0x80 !== (0xC0 & $in)) {
                // Incomplete multi-octet sequence.
                return false;
            }
            // Legal continuation: merge its six payload bits into the code point.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;
            if (0 === --$mState) {
                // End of the multi-octet sequence, mUcs4 now holds the final code point.
                // Check for illegal sequences and code points.
                if (
                    // From Unicode 3.1, non-shortest form is illegal
                    (2 === $mBytes && $mUcs4 < 0x0080) ||
                    (3 === $mBytes && $mUcs4 < 0x0800) ||
                    (4 === $mBytes && $mUcs4 < 0x10000) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal.
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Code points outside the Unicode range are illegal.
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }
                // reset the state for the next character
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
            }
        }
    }
    return true;
}
/**
 * Calculates and returns the maximum number of bytes taken by any
 * UTF-8 encoded character in the given string.
 *
 * The value is the maximum of the list returned by chr_size_list(); an empty list yields 0.
 *
 * @param string $str <p>The original Unicode string.</p>
 *
 * @return int <p>Max byte length of the given chars.</p>
 */
public static function max_chr_width($str) {
    $sizes = self::chr_size_list($str);

    return \count($sizes) > 0 ? (int)\max($sizes) : 0;
}
/**
 * Checks whether mbstring is available on the server.
 *
 * Side effect: when the extension is loaded, the mbstring internal encoding is set to UTF-8.
 *
 * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
 */
public static function mbstring_loaded() {
    if (!\extension_loaded('mbstring')) {
        return false;
    }
    \mb_internal_encoding('UTF-8');

    return true;
}
/**
 * Checks whether the string functions are overloaded by mbstring
 * (the MB_OVERLOAD_STRING bit of the "mbstring.func_overload" ini setting).
 *
 * @return bool <p>false when the MB_OVERLOAD_STRING constant is not defined or the bit is not set.</p>
 */
private static function mbstring_overloaded() {
    if (!\defined('MB_OVERLOAD_STRING')) {
        return false;
    }

    return (bool)(\ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING);
}
/**
 * Normalize the encoding-"name" input.
 *
 * Resolution order: empty input -> $fallback; "UTF-8" / "UTF8" -> "UTF-8"; previously resolved
 * names -> cached result; names listed in $ENCODINGS -> unchanged; otherwise the name is
 * upper-cased and looked up (without punctuation) in the alias table. Names that are not in
 * the alias table are returned upper-cased.
 *
 * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
 * @param mixed  $fallback <p>e.g.: UTF-8; returned only when $encoding is empty.</p>
 *
 * @return string <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.<br>For an empty $encoding the
 *                $fallback (an empty string by default) is returned.</p>
 */
public static function normalize_encoding($encoding, $fallback = '') {
    // results per raw input name
    static $resolvedNames = array();
    // alias (upper-case, punctuation removed) => canonical name
    static $aliases = array(
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5', // Cyrillic
        'ISO88596'    => 'ISO-8859-6', // Arabic
        'ISO88597'    => 'ISO-8859-7', // Greek
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    );

    if (!$encoding) {
        return $fallback;
    }
    // the by far most common names are answered without touching the cache
    if ('UTF-8' === $encoding || 'UTF8' === $encoding) {
        return 'UTF-8';
    }
    if (isset($resolvedNames[$encoding])) {
        return $resolvedNames[$encoding];
    }
    // names from the list of known encodings are accepted verbatim
    if (\in_array($encoding, self::$ENCODINGS, true)) {
        $resolvedNames[$encoding] = $encoding;

        return $encoding;
    }

    // alias lookup: upper-case, with everything except letters, digits and whitespace removed
    $upperName = \strtoupper($encoding);
    $aliasKey = \preg_replace('/[^a-zA-Z0-9\s]/', '', $upperName);
    $normalized = !empty($aliases[$aliasKey]) ? $aliases[$aliasKey] : $upperName;

    $resolvedNames[$encoding] = $normalized;

    return $normalized;
}
/**
 * Normalize some MS Word special characters.
 *
 * Every key of the $UTF8_MSWORD table found in the input is replaced by the table's value.
 *
 * @param string $str <p>The string to be normalized.</p>
 *
 * @return string <p>The normalized string, or an empty string for empty input.</p>
 */
public static function normalize_msword($str) {
    // search / replace lists are derived from the table once and then reused
    static $wordChars = null;
    static $plainChars = null;

    if (!isset($str[0])) {
        return '';
    }
    if ($wordChars === null) {
        $wordChars = \array_keys(self::$UTF8_MSWORD);
        $plainChars = \array_values(self::$UTF8_MSWORD);
    }

    return \str_replace($wordChars, $plainChars, $str);
}
/**
 * Normalize the whitespace.
 *
 * Every character of the $WHITESPACE_TABLE is replaced by a plain space; unless requested
 * otherwise, the characters of the $BIDI_UNI_CODE_CONTROLS_TABLE are removed first.
 *
 * @param string $str                     <p>The string to be normalized.</p>
 * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
 * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
 *                                        bidirectional text chars.</p>
 *
 * @return string
 */
public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false) {
    if (!isset($str[0])) {
        return '';
    }
    static $WHITESPACE_CACHE = array();
    // The cache slot is derived from the same strict test that decides how the table is built.
    // (Casting the argument to int instead let a truthy non-bool value such as 1 store the
    // table that still contains the non-breaking space under slot 1, which later calls with
    // true would then reuse.)
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = $keepNbsp ? 1 : 0;
    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $WHITESPACE_CACHE[$cacheKey] = self::$WHITESPACE_TABLE;
        if ($keepNbsp) {
            unset($WHITESPACE_CACHE[$cacheKey]['NO-BREAK SPACE']);
        }
        $WHITESPACE_CACHE[$cacheKey] = \array_values($WHITESPACE_CACHE[$cacheKey]);
    }
    if ($keepBidiUnicodeControls === false) {
        static $BIDI_UNICODE_CONTROLS_CACHE = null;
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = \array_values(self::$BIDI_UNI_CODE_CONTROLS_TABLE);
        }
        // bidirectional control characters are dropped, not replaced by a space
        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }
    return \str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
/**
 * Calculates Unicode code point of the given UTF-8 encoded character.
 *
 * INFO: opposite to StringLib::chr()
 *
 * Input in another encoding is converted to UTF-8 first. Results are memoized per
 * (original input, normalized encoding) pair.
 *
 * @param string $chr      <p>The character of which to calculate code point.</p>
 * @param string $encoding [optional] <p>Default is UTF-8</p>
 *
 * @return int <p>
 *             Unicode code point of the given character,<br>
 *             0 when no byte could be read from the input.
 *             </p>
 */
public static function ord($chr, $encoding = 'UTF-8') {
    static $memo = array();

    // the cache key is built from the untouched input
    $rawInput = $chr;
    if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding, 'UTF-8');
    }
    // still something else than UTF-8 after normalizing the name: convert the character
    if ($encoding !== 'UTF-8') {
        $chr = (string)\mb_convert_encoding($chr, 'UTF-8', $encoding);
    }

    $memoKey = $rawInput . $encoding;
    if (isset($memo[$memoKey]) === true) {
        return $memo[$memoKey];
    }

    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }

    // preferred: let intl do the work; an empty result falls through to the manual decoder
    if (self::$SUPPORT['intlChar'] === true) {
        $intlResult = \IntlChar::ord($chr);
        if ($intlResult) {
            $memo[$memoKey] = $intlResult;

            return $intlResult;
        }
    }

    // manual decoding of the first (up to) four bytes; unpack() numbers them starting at 1
    /** @noinspection CallableParameterUseCaseInTypeContextInspection */
    $bytes = \unpack('C*', (string)\mb_substr($chr, 0, 4, '8BIT'));
    $lead = $bytes ? $bytes[1] : 0;

    if (0xF0 <= $lead && isset($bytes[4])) {
        // 4 byte sequence
        $codePoint = (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    } elseif (0xE0 <= $lead && isset($bytes[3])) {
        // 3 byte sequence
        $codePoint = (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    } elseif (0xC0 <= $lead && isset($bytes[2])) {
        // 2 byte sequence
        $codePoint = (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    } else {
        // single byte (or nothing usable after the lead byte)
        $codePoint = $lead;
    }
    $memo[$memoKey] = $codePoint;

    return $codePoint;
}
/**
 * Checks if the /u modifier is available that enables Unicode support in PCRE.
 *
 * The check runs an empty pattern with the /u modifier; errors are silenced on purpose.
 *
 * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
 */
public static function pcre_utf8_support() {
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $probe = @\preg_match('//u', '');

    return (bool)$probe;
}
/**
 * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
 *
 * Each entry of the $BOM table (byte sequence => byte length) is tested once, in table
 * order, against the start of the string and cut off when it matches.
 *
 * @param string $str <p>The input string.</p>
 *
 * @return string <p>String without UTF-BOM</p>
 */
public static function remove_bom($str) {
    if (!isset($str[0])) {
        return '';
    }
    foreach (self::$BOM as $bomString => $bomByteLength) {
        // only a match at byte offset 0 is a BOM
        if (\mb_strpos($str, $bomString, 0, '8BIT') !== 0) {
            continue;
        }
        $remainder = \mb_substr($str, $bomByteLength, null, '8BIT');
        $str = ($remainder === false) ? '' : (string)$remainder;
    }

    return $str;
}
/**
 * Remove invisible characters from a string.
 *
 * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
 *
 * Removed are all control characters except newline (dec 10), carriage return (dec 13) and
 * horizontal tab (dec 09), plus DEL (dec 127) - optionally also in their URL-encoded form.
 * The replacement is repeated until nothing matches any more, so sequences that only appear
 * after a removal (e.g. "%%0000") are caught as well.
 *
 * Copied from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
 *
 * @param string $str
 * @param bool   $url_encoded <p>Set to false to leave URL-encoded control characters untouched.</p>
 * @param string $replacement <p>Text inserted for every removed sequence; it must not itself
 *                            contain a matching sequence, or the loop would never finish.</p>
 *
 * @return string
 */
public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '') {
    // init
    $non_displayables = array();
    if ($url_encoded) {
        // Hex digits of a percent-encoded octet are case-insensitive, hence the "i" modifier
        // (without it "%0B" or "%1F" slipped through).
        $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
        $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
        $non_displayables[] = '/%7f/i'; // url encoded 127, matching the raw \x7F handling below
    }
    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127
    do {
        $str = (string)\preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);
    return $str;
}
/**
 * Replace the diamond question mark (U+FFFD) and invalid UTF-8 chars with the replacement.
 *
 * @param string $str                <p>The input string</p>
 * @param string $replacementChar    <p>The replacement; an empty string removes the characters.</p>
 * @param bool   $processInvalidUtf8 <p>Also replace invalid UTF-8 byte sequences.</p>
 *
 * @return string
 */
public static function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true) {
    if (!isset($str[0])) {
        return '';
    }
    if ($processInvalidUtf8 === true) {
        if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
            self::checkForSupport();
        }
        // mb_substitute_character() only accepts a code point or one of its keywords, so the
        // replacement string itself must not be handed to it. Invalid sequences are therefore
        // first turned into U+FFFD, which the str_replace() below swaps for the replacement.
        $save = \mb_substitute_character();
        \mb_substitute_character(0xFFFD);
        $strTmp = \mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        // restore the previous (global) mbstring setting
        \mb_substitute_character($save);
        $str = \is_string($strTmp) ? $strTmp : '';
    }
    return \str_replace(
        array(
            "\xEF\xBF\xBD",
            '�',
        ),
        array(
            $replacementChar,
            $replacementChar,
        ),
        $str
    );
}
/**
 * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
 *
 * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
 * @param bool   $keepAsciiChars <p>Set to <strong>true</strong> to return ASCII input unchanged.</p>
 * @param string $encoding       [optional] <p>Default is UTF-8</p>
 *
 * @return string <p>The HTML numbered entity, or an empty string for empty input.</p>
 */
public static function single_chr_html_encode($char, $keepAsciiChars = false, $encoding = 'UTF-8') {
    if (!isset($char[0])) {
        return '';
    }
    if ($keepAsciiChars === true && self::is_ascii($char) === true) {
        return $char;
    }
    $targetEncoding = ($encoding === 'UTF-8')
        ? $encoding
        : self::normalize_encoding($encoding, 'UTF-8');

    return '&#' . self::ord($char, $targetEncoding) . ';';
}
/**
 * Convert a string to an array of Unicode characters.
 *
 * With PCRE UTF-8 support the string is split by a regular expression; otherwise a byte-wise
 * fallback is used that silently skips bytes which do not form a complete sequence.
 *
 * @param string $str       <p>The string to split into array.</p>
 * @param int    $length    [optional] <p>Max character length of each array element.</p>
 * @param bool   $cleanUtf8 [optional] <p>Run the string through clean() before splitting.</p>
 *
 * @return string[] <p>An array containing chunks of the string.</p>
 */
public static function split($str, $length = 1, $cleanUtf8 = false) {
    if (!isset($str[0])) {
        return array();
    }
    $chars = array();
    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }
    if ($cleanUtf8 === true) {
        $str = self::clean($str);
    }
    if (self::$SUPPORT['pcre_utf8'] === true) {
        \preg_match_all('/./us', $str, $matches);
        if (isset($matches[0])) {
            $chars = $matches[0];
        }
        unset($matches);
    } else {
        // fallback
        // NOTE(review): support was already initialised above; this second guard looks redundant.
        if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
            self::checkForSupport();
        }
        $byteCount = \strlen($str); // count the bytes
        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $byteCount; $i++) {
            $lead = $str[$i];
            // plain ASCII byte
            if (($lead & "\x80") === "\x00") {
                $chars[] = $lead;
                continue;
            }
            // number of continuation bytes announced by the lead byte
            if (($lead & "\xE0") === "\xC0") {
                $tailLength = 1;
            } elseif (($lead & "\xF0") === "\xE0") {
                $tailLength = 2;
            } elseif (($lead & "\xF8") === "\xF0") {
                $tailLength = 3;
            } else {
                continue;
            }
            // the sequence must fit into the string ...
            if (!isset($str[$i + $tailLength])) {
                continue;
            }
            // ... and consist of continuation bytes only
            $sequence = $lead;
            $complete = true;
            for ($k = 1; $k <= $tailLength; $k++) {
                if (($str[$i + $k] & "\xC0") !== "\x80") {
                    $complete = false;
                    break;
                }
                $sequence .= $str[$i + $k];
            }
            if ($complete) {
                $chars[] = $sequence;
                $i += $tailLength;
            }
        }
    }
    // group the characters into chunks of $length characters
    if ($length > 1) {
        return \array_map(
            function ($item) {
                return \implode('', $item);
            },
            \array_chunk($chars, $length)
        );
    }
    if (isset($chars[0]) && $chars[0] === '') {
        return array();
    }

    return $chars;
}
/**
 * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
 *
 * Detection steps: binary-looking input is tested for UTF-16 / UTF-32 only; everything else
 * is tested for ASCII, then UTF-8, then via mb_detect_encoding() and finally - when iconv()
 * is available - via an iconv() round trip over the $ENCODINGS list.
 *
 * @param string $str <p>The input string.</p>
 *
 * @return false|string <p>
 *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br>
 *                      otherwise it will return false.
 *                      </p>
 */
public static function str_detect_encoding($str)
{
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //
    if (self::is_binary($str, true) === true) {
        // each detector converts the input several times, so it is run only once
        $utf16 = self::is_utf16($str);
        if ($utf16 === 1) {
            return 'UTF-16LE';
        }
        if ($utf16 === 2) {
            return 'UTF-16BE';
        }
        $utf32 = self::is_utf32($str);
        if ($utf32 === 1) {
            return 'UTF-32LE';
        }
        if ($utf32 === 2) {
            return 'UTF-32BE';
        }
        return false;
    }
    //
    // 2.) simple check for ASCII chars
    //
    if (self::is_ascii($str) === true) {
        return 'ASCII';
    }
    //
    // 3.) simple check for UTF-8 chars
    //
    if (self::is_utf8($str) === true) {
        return 'UTF-8';
    }
    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"
    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );
    $encoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($encoding) {
        return $encoding;
    }
    //
    // 5.) check via "iconv()"
    //
    if (\function_exists('iconv') === false) {
        // without the iconv extension there is nothing left to try
        return false;
    }
    $strAsString = (string)$str;
    foreach (self::$ENCODINGS as $encodingTmp) {
        // An encoding is accepted when converting the input from that encoding to itself
        // gives back the identical bytes (iconv() returns false on failure).
        # INFO: //IGNORE and //TRANSLIT still throw notice
        /** @noinspection PhpUsageOfSilenceOperatorInspection */
        if (@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $strAsString) === $strAsString) {
            return $encodingTmp;
        }
    }
    return false;
}
/**
 * This function leaves UTF-8 characters alone, while converting almost all non-UTF8 to UTF8.
 *
 * <ul>
 * <li>It decodes unicode escape sequences ("\uXXXX") and, on request, html-entities.</li>
 * <li>It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859.</li>
 * <li>WARNING: It does not remove invalid UTF-8 characters, so you maybe need to use "StringLib::clean()" for this
 * case.</li>
 * </ul>
 *
 * @param string|string[] $str                    <p>Any string or array.</p>
 * @param bool            $decodeHtmlEntityToUtf8 <p>Set to true, if you need to decode html-entities.</p>
 *
 * @return string|string[] <p>The UTF-8 encoded string.</p>
 */
public static function to_utf8($str, $decodeHtmlEntityToUtf8 = false) {
    // arrays are converted element by element, keys are kept
    if (\is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = self::to_utf8($value, $decodeHtmlEntityToUtf8);
        }
        return $str;
    }
    $str = (string)$str;
    if (!isset($str[0])) {
        return $str;
    }
    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }
    $byteCount = \strlen($str); // count the bytes
    $out = '';
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $byteCount; $i++) {
        $lead = $str[$i];
        if ($lead < "\xC0") {
            // 0x80-0xBF outside of a sequence needs conversion, everything below is kept as it is
            $out .= (($lead & "\xC0") === "\x80") ? self::to_utf8_convert($lead) : $lead;
            continue;
        }
        // from here on: should be converted to UTF8, if it's not UTF8 already
        if ($lead <= "\xDF") {
            $tailLength = 1; // looks like 2 bytes UTF8
        } elseif ($lead <= "\xEF") {
            $tailLength = 2; // looks like 3 bytes UTF8
        } elseif ($lead <= "\xF7") {
            $tailLength = 3; // looks like 4 bytes UTF8
        } else {
            $tailLength = 0; // doesn't look like UTF8, but should be converted
        }
        // all announced continuation bytes must be in 0x80-0xBF; bytes beyond the end of the
        // string count as "\x00" and therefore fail the test
        $sequence = $lead;
        $looksLikeUtf8 = ($tailLength > 0);
        for ($k = 1; $looksLikeUtf8 && $k <= $tailLength; $k++) {
            $next = ($i + $k >= $byteCount) ? "\x00" : $str[$i + $k];
            $looksLikeUtf8 = ($next >= "\x80" && $next <= "\xBF");
            $sequence .= $next;
        }
        if ($looksLikeUtf8) {
            // yeah, almost sure it's UTF8 already
            $out .= $sequence;
            $i += $tailLength;
        } else {
            // not valid UTF8 - convert the lead byte only
            $out .= self::to_utf8_convert($lead);
        }
    }
    // decode unicode escape sequences
    $out = \preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
            return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $out
    );
    // decode html-entities
    if ($decodeHtmlEntityToUtf8 === true) {
        $out = self::html_entity_decode($out);
    }
    return $out;
}
/**
 * Converts a single non-UTF-8 byte to its UTF-8 representation.
 *
 * Bytes listed in the $WIN1252_TO_UTF8 table are mapped through that table; every other
 * byte is encoded as a two byte UTF-8 sequence.
 *
 * @param string $int <p>The byte to convert (a one-byte string, despite the parameter name).</p>
 *
 * @return string
 */
private static function to_utf8_convert($int) {
    $ordC1 = self::$ORD[$int];
    if (isset(self::$WIN1252_TO_UTF8[$ordC1])) { // found in Windows-1252 special cases
        return (string)self::$WIN1252_TO_UTF8[$ordC1];
    }
    // The two top bits select the lead byte. The shift keeps the table index an integer;
    // "$ordC1 / 64" yields a float key, which relies on implicit truncation (deprecated as
    // of PHP 8.1).
    $cc1 = self::$CHR[$ordC1 >> 6] | "\xC0";
    // the six low bits go into the continuation byte
    $cc2 = ($int & "\x3F") | "\x80";
    return $cc1 . $cc2;
}
/**
 * Decodes an UTF-8 string to ISO-8859-1 (with the Windows-1252 extension for 0x80-0x9F).
 *
 * Characters that cannot be represented are replaced by "?".
 *
 * @param string $str           <p>The input string.</p>
 * @param bool   $keepUtf8Chars <p>Set to true to get the input back (after the Windows-1252
 *                              replacement step) whenever the decoded result is not shorter,
 *                              measured with mb_strlen(), than that input.</p>
 *
 * @return string
 */
public static function utf8_decode($str, $keepUtf8Chars = false) {
    if (!isset($str[0])) {
        return '';
    }
    static $UTF8_TO_WIN1252_KEYS_CACHE = null;
    static $UTF8_TO_WIN1252_VALUES_CACHE = null;
    if ($UTF8_TO_WIN1252_KEYS_CACHE === null) {
        // $WIN1252_TO_UTF8 maps a byte ordinal to its UTF-8 sequence. Decoding needs the
        // opposite direction: search for the UTF-8 sequences and put the single byte in.
        // (Searching for the table keys replaced the digit text "128".."159" instead.)
        $UTF8_TO_WIN1252_KEYS_CACHE = \array_values(self::$WIN1252_TO_UTF8);
        $UTF8_TO_WIN1252_VALUES_CACHE = \array_map('chr', \array_keys(self::$WIN1252_TO_UTF8));
    }
    /** @noinspection PhpInternalEntityUsedInspection */
    $str = \str_replace($UTF8_TO_WIN1252_KEYS_CACHE, $UTF8_TO_WIN1252_VALUES_CACHE, $str);
    if (!isset(self::$SUPPORT['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
    }
    // save for later comparison
    $str_backup = $str;
    $len = \strlen($str); // count the bytes
    $noCharFound = '?';
    // The string is rewritten in place: $i reads, $j (never ahead of $i) writes.
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0, $j = 0; $i < $len; ++$i, ++$j) {
        switch ($str[$i] & "\xF0") {
            case "\xC0":
            case "\xD0":
                // 2 byte sequence
                if (!isset($str[$i + 1])) {
                    // truncated sequence at the end of the input
                    $str[$j] = $noCharFound;
                    break;
                }
                $c = (self::$ORD[$str[$i] & "\x1F"] << 6) | self::$ORD[$str[++$i] & "\x3F"];
                // only code points below 256 exist in the target encoding
                $str[$j] = $c < 256 ? self::$CHR[$c] : $noCharFound;
                break;
            /** @noinspection PhpMissingBreakStatementInspection */
            case "\xF0":
                // 4 byte sequence: skip one byte more than a 3 byte sequence (fall through)
                ++$i;
            case "\xE0":
                // 3 byte sequence: not representable
                $str[$j] = $noCharFound;
                $i += 2;
                break;
            default:
                // single byte, copied unchanged
                $str[$j] = $str[$i];
        }
    }
    // only the first $j bytes hold decoded output
    $return = (string)\mb_substr($str, 0, $j, '8BIT');
    if (
        $keepUtf8Chars === true
        &&
        \mb_strlen($return) >= \mb_strlen($str_backup)
    ) {
        return $str_backup;
    }
    return $return;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment