Skip to content

Instantly share code, notes, and snippets.

@ngrebenshikov
Created January 9, 2016 08:40
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ngrebenshikov/e82ffca794554c67a9af to your computer and use it in GitHub Desktop.
Save ngrebenshikov/e82ffca794554c67a9af to your computer and use it in GitHub Desktop.
UTF-8 to CP1251 (Windows-1251) string converter for Haxe
package ;
import haxe.io.Bytes;
import haxe.ds.IntMap;
/**
    Helper for converting UTF-8 encoded text to the single-byte
    Windows-1251 (CP1251) code page.
**/
class TextEncodingHelper {
    // Based on https://code.google.com/p/convert-utf8-to-cp1251/source/browse/main.cpp

    /**
        Unicode code point => CP1251 byte, for the characters that are not
        covered by the arithmetic ranges handled in `codePointToCp1251`.
    **/
    private static var utf8ToCp1251: IntMap<Int> = [
        0x201A => 0x82, // SINGLE LOW-9 QUOTATION MARK
        0x0453 => 0x83, // CYRILLIC SMALL LETTER GJE
        0x201E => 0x84, // DOUBLE LOW-9 QUOTATION MARK
        0x2026 => 0x85, // HORIZONTAL ELLIPSIS
        0x2020 => 0x86, // DAGGER
        0x2021 => 0x87, // DOUBLE DAGGER
        0x20AC => 0x88, // EURO SIGN
        0x2030 => 0x89, // PER MILLE SIGN
        0x0409 => 0x8A, // CYRILLIC CAPITAL LETTER LJE
        0x2039 => 0x8B, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        0x040A => 0x8C, // CYRILLIC CAPITAL LETTER NJE
        0x040C => 0x8D, // CYRILLIC CAPITAL LETTER KJE
        0x040B => 0x8E, // CYRILLIC CAPITAL LETTER TSHE
        0x040F => 0x8F, // CYRILLIC CAPITAL LETTER DZHE
        0x0452 => 0x90, // CYRILLIC SMALL LETTER DJE
        0x2018 => 0x91, // LEFT SINGLE QUOTATION MARK
        0x2019 => 0x92, // RIGHT SINGLE QUOTATION MARK
        0x201C => 0x93, // LEFT DOUBLE QUOTATION MARK
        0x201D => 0x94, // RIGHT DOUBLE QUOTATION MARK
        0x2022 => 0x95, // BULLET
        0x2013 => 0x96, // EN DASH
        0x2014 => 0x97, // EM DASH
        0x2122 => 0x99, // TRADE MARK SIGN
        0x0459 => 0x9A, // CYRILLIC SMALL LETTER LJE
        0x203A => 0x9B, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        0x045A => 0x9C, // CYRILLIC SMALL LETTER NJE
        0x045C => 0x9D, // CYRILLIC SMALL LETTER KJE
        0x045B => 0x9E, // CYRILLIC SMALL LETTER TSHE
        0x045F => 0x9F, // CYRILLIC SMALL LETTER DZHE
        0x00A0 => 0xA0, // NO-BREAK SPACE
        0x040E => 0xA1, // CYRILLIC CAPITAL LETTER SHORT U
        0x045E => 0xA2, // CYRILLIC SMALL LETTER SHORT U
        0x0408 => 0xA3, // CYRILLIC CAPITAL LETTER JE
        0x00A4 => 0xA4, // CURRENCY SIGN
        0x0490 => 0xA5, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN
        0x00A6 => 0xA6, // BROKEN BAR
        0x00A7 => 0xA7, // SECTION SIGN
        0x0401 => 0xA8, // CYRILLIC CAPITAL LETTER IO
        0x00A9 => 0xA9, // COPYRIGHT SIGN
        0x0404 => 0xAA, // CYRILLIC CAPITAL LETTER UKRAINIAN IE
        0x00AB => 0xAB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        0x00AC => 0xAC, // NOT SIGN
        0x00AD => 0xAD, // SOFT HYPHEN
        0x00AE => 0xAE, // REGISTERED SIGN
        0x0407 => 0xAF, // CYRILLIC CAPITAL LETTER YI
        0x00B0 => 0xB0, // DEGREE SIGN
        0x00B1 => 0xB1, // PLUS-MINUS SIGN
        0x0406 => 0xB2, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
        0x0456 => 0xB3, // CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        0x0491 => 0xB4, // CYRILLIC SMALL LETTER GHE WITH UPTURN
        0x00B5 => 0xB5, // MICRO SIGN
        0x00B6 => 0xB6, // PILCROW SIGN
        0x00B7 => 0xB7, // MIDDLE DOT
        0x0451 => 0xB8, // CYRILLIC SMALL LETTER IO
        0x2116 => 0xB9, // NUMERO SIGN
        0x0454 => 0xBA, // CYRILLIC SMALL LETTER UKRAINIAN IE
        0x00BB => 0xBB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        0x0458 => 0xBC, // CYRILLIC SMALL LETTER JE
        0x0405 => 0xBD, // CYRILLIC CAPITAL LETTER DZE
        0x0455 => 0xBE, // CYRILLIC SMALL LETTER DZE
        0x0457 => 0xBF // CYRILLIC SMALL LETTER YI
    ];

    /**
        Converts UTF-8 text to CP1251.

        The input is turned into bytes with `Bytes.ofString` and decoded as
        UTF-8 sequences of one to four bytes. Every decoded code point that
        has a CP1251 equivalent is appended to the result with
        `StringBuf.addChar`, so the returned string holds one character per
        CP1251 byte value.

        Code points without a CP1251 equivalent, malformed or truncated
        sequences and stray continuation bytes are reported through `trace`
        and left out of the result.

        NOTE(review): whether the returned String is a raw CP1251 byte
        string depends on the target's native string encoding - confirm
        for the targets this is compiled to.

        @param utf8String text to convert
        @return the converted text
    **/
    private static function convertUtf8ToWindows1251(utf8String: String): String {
        var utf8: Bytes = Bytes.ofString(utf8String);
        var result: StringBuf = new StringBuf();
        var length: Int = utf8.length;
        var i: Int = 0;
        while (i < length) {
            var lead: Int = utf8.get(i);

            // 0xxxxxxx: plain ASCII, identical in CP1251.
            if ((lead & 0x80) == 0) {
                result.addChar(lead);
                i += 1;
                continue;
            }

            // Work out how many continuation bytes the lead byte announces.
            var extra: Int = 0;
            var codePoint: Int = 0;
            var minCodePoint: Int = 0; // smallest value that is not an overlong encoding
            if ((lead & 0xE0) == 0xC0) {
                extra = 1;
                codePoint = lead & 0x1F;
                minCodePoint = 0x80;
            } else if ((lead & 0xF0) == 0xE0) {
                extra = 2;
                codePoint = lead & 0x0F;
                minCodePoint = 0x800;
            } else if ((lead & 0xF8) == 0xF0) {
                extra = 3;
                codePoint = lead & 0x07;
                minCodePoint = 0x10000;
            } else {
                // Stray continuation byte or invalid lead byte: skip it alone.
                trace("Can't convert the char");
                i += 1;
                continue;
            }

            // Consume continuation bytes (10xxxxxx) without ever reading past
            // the end of the buffer. A byte that is not a continuation is left
            // in place so decoding resynchronises on it.
            var pos: Int = i + 1;
            var remaining: Int = extra;
            while (remaining > 0 && pos < length) {
                var next: Int = utf8.get(pos);
                if ((next & 0xC0) != 0x80) {
                    break;
                }
                codePoint = (codePoint << 6) | (next & 0x3F);
                pos += 1;
                remaining -= 1;
            }
            i = pos;

            if (remaining > 0 || codePoint < minCodePoint) {
                // Truncated, malformed or overlong sequence.
                trace("Can't convert the char");
                continue;
            }

            var cp1251: Int = codePointToCp1251(codePoint);
            if (cp1251 >= 0) {
                result.addChar(cp1251);
            } else {
                trace("Can't convert the char");
            }
        }
        return result.toString();
    }

    /**
        Maps a single Unicode code point (>= 0x80) to its CP1251 byte.

        @param codePoint decoded Unicode code point
        @return the CP1251 byte value, or -1 when there is no mapping
    **/
    private static function codePointToCp1251(codePoint: Int): Int {
        if (codePoint >= 0x410 && codePoint <= 0x44F) {
            // Basic Russian alphabet maps linearly onto 0xC0..0xFF.
            return codePoint - 0x350;
        }
        if (codePoint >= 0x80 && codePoint <= 0xFF) {
            // NOTE(review): kept from the original implementation. Passing
            // U+0080..U+00FF through unchanged is only correct for the code
            // points that also appear in the table (U+00A0, U+00A7, ...);
            // others, e.g. accented Latin letters, end up as Cyrillic letters
            // in CP1251. Consider relying on the table alone.
            return codePoint;
        }
        if (codePoint >= 0x402 && codePoint <= 0x403) {
            // CYRILLIC CAPITAL LETTERS DJE and GJE map onto 0x80 and 0x81.
            return codePoint - 0x382;
        }
        if (utf8ToCp1251.exists(codePoint)) {
            return utf8ToCp1251.get(codePoint);
        }
        return -1;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment