Last active
April 13, 2021 18:26
-
-
Save pedrolcl/e37aae1f8ca78c1cdf361496fc6ebc07 to your computer and use it in GitHub Desktop.
integration of uchardet with QTextCodec - codec names don't always match
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build script for the integrate_uchardet console program, which only needs
# the non-GUI part of Qt. Works with either Qt6 or Qt5, whichever is found first.
cmake_minimum_required(VERSION 3.14)
project(integrate_uchardet LANGUAGES CXX)

set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_AUTOUIC ON)
set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(QT NAMES Qt6 Qt5 REQUIRED)
find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core REQUIRED)

add_executable(integrate_uchardet integrate_uchardet.cpp)
target_link_libraries(integrate_uchardet Qt${QT_VERSION_MAJOR}::Core)

# QTextCodec was moved out of QtCore in Qt 6; it is provided by the
# Core5Compat module there, so a Qt6 build must find and link it explicitly.
if(QT_VERSION_MAJOR GREATER_EQUAL 6)
    find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core5Compat REQUIRED)
    target_link_libraries(integrate_uchardet Qt${QT_VERSION_MAJOR}::Core5Compat)
endif()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <QDebug> | |
#include <QCoreApplication> | |
#include <QList> | |
#include <QMap> | |
#include <QTextCodec> | |
/**
 * Charset names to be checked against QTextCodec, grouped by language.
 *
 * Key:   language (or language group) label, used only for reporting.
 * Value: charset names that are looked up with QTextCodec::codecForName().
 *
 * NOTE(review): the lists presumably mirror the languages/charsets that
 * uchardet documents as supported - confirm against the uchardet README
 * when updating. The names are kept in the spelling used there (mixed
 * "Windows-"/"WINDOWS-" casing is intentional, not normalized).
 */
static QMap<QByteArray,QList<QByteArray> > ucodecs{
    {"International (Unicode)", {
        "UTF-8",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32BE",
        "UTF-32LE"}},
    {"Arabic", {
        "ISO-8859-6",
        "WINDOWS-1256"}},
    {"Bulgarian", {
        "ISO-8859-5",
        "WINDOWS-1251"}},
    {"Chinese", {
        "ISO-2022-CN",
        "BIG5",
        /*"EUC-TW",*/
        "GB18030",
        "HZ-GB-2312"}},
    {"Croatian", {
        "ISO-8859-2",
        "ISO-8859-13",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Czech", {
        "Windows-1250",
        "ISO-8859-2",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Danish", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"English", {
        "ASCII"}},
    {"Esperanto", {
        "ISO-8859-3"}},
    {"Estonian", {
        "ISO-8859-4",
        "ISO-8859-13",
        // Was a second "ISO-8859-13"; the duplicate made one charset be
        // checked twice. ISO-8859-15 is the likely intended entry.
        "ISO-8859-15",
        "Windows-1252",
        "Windows-1257"}},
    {"Finnish", {
        "ISO-8859-1",
        "ISO-8859-4",
        "ISO-8859-9",
        "ISO-8859-13",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"French", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"German", {
        "ISO-8859-1",
        "WINDOWS-1252"}},
    {"Greek", {
        "ISO-8859-7",
        "WINDOWS-1253"}},
    {"Hebrew", {
        "ISO-8859-8",
        "WINDOWS-1255"}},
    {"Hungarian", {
        "ISO-8859-2",
        "WINDOWS-1250"}},
    {"Irish Gaelic", {
        "ISO-8859-1",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Italian", {
        "ISO-8859-1",
        "ISO-8859-3",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Japanese", {
        "ISO-2022-JP",
        "SHIFT_JIS",
        "EUC-JP"}},
    {"Korean", {
        "ISO-2022-KR",
        "EUC-KR", "UHC"}},
    {"Lithuanian", {
        "ISO-8859-4",
        "ISO-8859-10",
        "ISO-8859-13"}},
    {"Latvian", {
        "ISO-8859-4",
        "ISO-8859-10",
        "ISO-8859-13"}},
    {"Maltese", {
        "ISO-8859-3"}},
    {"Polish", {
        "ISO-8859-2",
        "ISO-8859-13",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Portuguese", {
        "ISO-8859-1",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Romanian", {
        "ISO-8859-2",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852"}},
    {"Russian", {
        "ISO-8859-5",
        "KOI8-R",
        "WINDOWS-1251",
        "MAC-CYRILLIC",
        "IBM866",
        "IBM855"}},
    {"Slovak", {
        "Windows-1250",
        "ISO-8859-2",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Slovene", {
        "ISO-8859-2",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Spanish", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Swedish", {
        "ISO-8859-1",
        "ISO-8859-4",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Thai", {
        "TIS-620",
        "ISO-8859-11"}},
    {"Turkish", {
        "ISO-8859-3",
        "ISO-8859-9"}},
    {"Vietnamese", {
        "VISCII",
        "Windows-1258"}}
};
/** | |
* References: | |
* | |
* https://www.iana.org/assignments/character-sets/character-sets.xml | |
* https://doc.qt.io/qt-5/qtextcodec.html | |
*/ | |
QMap<QByteArray,int> umibs{ | |
/* Linux */ | |
{"UHC", 38}, | |
{"ISO-8859-11", 2259}, | |
{"VISCII", 2082}, | |
/* Windows */ | |
{"ISO-2022-CN", 104}, | |
{"HZ-GB-2312", 2085}, | |
{"IBM852", 2010}, | |
{"ASCII", 3}, | |
{"ISO-2022-KR", 37}, | |
{"IBM855", 2046} | |
/* macOS */ | |
}; | |
int main(int argc, char *argv[]) | |
{ | |
QCoreApplication a(argc, argv); | |
auto keys = ucodecs.keys(); | |
foreach(const auto& k, keys) { | |
foreach(const auto& c, ucodecs[k]) { | |
auto codec = QTextCodec::codecForName(c); | |
if (codec == nullptr) { | |
if (!umibs.contains(c)) { | |
qWarning() << k << "\t" << c << ":CHECK!!!"; | |
} | |
} else { | |
if (!umibs.contains(c)) { | |
umibs.insert(c, codec->mibEnum()); | |
} else if (codec->mibEnum() != umibs[c]) { | |
qWarning() << "\t" << c << ":" << codec->mibEnum() << "?" << umibs[c]; | |
} | |
} | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment