Skip to content

Instantly share code, notes, and snippets.

@pedrolcl
Last active April 13, 2021 18:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pedrolcl/e37aae1f8ca78c1cdf361496fc6ebc07 to your computer and use it in GitHub Desktop.
Save pedrolcl/e37aae1f8ca78c1cdf361496fc6ebc07 to your computer and use it in GitHub Desktop.
Integration of uchardet with QTextCodec - codec names don't always match
# Build script for the integrate_uchardet console tool (a single C++ source
# that only needs the non-GUI part of Qt).
cmake_minimum_required(VERSION 3.14)
project(integrate_uchardet LANGUAGES CXX)

# Make headers in the current source/binary directories visible to the target.
set(CMAKE_INCLUDE_CURRENT_DIR ON)

# Let CMake run Qt's code generators (uic, moc, rcc) automatically.
set(CMAKE_AUTOUIC ON)
set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Accept either Qt6 or Qt5; QT_VERSION_MAJOR records which one was found.
find_package(QT NAMES Qt6 Qt5 REQUIRED)
find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core REQUIRED)

add_executable(integrate_uchardet integrate_uchardet.cpp)
target_link_libraries(integrate_uchardet PRIVATE Qt${QT_VERSION_MAJOR}::Core)

# QTextCodec is no longer part of QtCore in Qt 6: it was moved to the
# Core5Compat module, which must be found and linked explicitly. Without this
# the program fails to compile whenever Qt6 is the version picked above.
if(QT_VERSION_MAJOR GREATER_EQUAL 6)
    find_package(Qt6 COMPONENTS Core5Compat REQUIRED)
    target_link_libraries(integrate_uchardet PRIVATE Qt6::Core5Compat)
endif()
#include <QDebug>
#include <QCoreApplication>
#include <QList>
#include <QMap>
#include <QTextCodec>
/**
 * Table of charset names to be checked against QTextCodec, grouped by a
 * language label.
 *
 * Key:   language (or language group) label.
 * Value: charset names listed for that language; main() feeds each of them
 *        to QTextCodec::codecForName().
 *
 * NOTE(review): the names presumably mirror uchardet's list of supported
 * languages/encodings - confirm against the uchardet README when updating.
 */
static QMap<QByteArray,QList<QByteArray> > ucodecs{
    {"International (Unicode)", {
        "UTF-8",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32BE",
        "UTF-32LE"}},
    {"Arabic", {
        "ISO-8859-6",
        "WINDOWS-1256"}},
    {"Bulgarian", {
        "ISO-8859-5",
        "WINDOWS-1251"}},
    {"Chinese", {
        "ISO-2022-CN",
        "BIG5",
        /*"EUC-TW",*/
        "GB18030",
        "HZ-GB-2312"}},
    {"Croatian", {
        "ISO-8859-2",
        "ISO-8859-13",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Czech", {
        "Windows-1250",
        "ISO-8859-2",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Danish", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"English", {
        "ASCII"}},
    {"Esperanto", {
        "ISO-8859-3"}},
    {"Estonian", {
        "ISO-8859-4",
        "ISO-8859-13",
        // NOTE(review): this entry used to be a second, duplicated
        // "ISO-8859-13". uchardet appears to ship an ISO-8859-15 model for
        // Estonian, so the duplicate is assumed to be a typo - confirm.
        "ISO-8859-15",
        "Windows-1252",
        "Windows-1257"}},
    {"Finnish", {
        "ISO-8859-1",
        "ISO-8859-4",
        "ISO-8859-9",
        "ISO-8859-13",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"French", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"German", {
        "ISO-8859-1",
        "WINDOWS-1252"}},
    {"Greek", {
        "ISO-8859-7",
        "WINDOWS-1253"}},
    {"Hebrew", {
        "ISO-8859-8",
        "WINDOWS-1255"}},
    {"Hungarian", {
        "ISO-8859-2",
        "WINDOWS-1250"}},
    {"Irish Gaelic", {
        "ISO-8859-1",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Italian", {
        "ISO-8859-1",
        "ISO-8859-3",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Japanese", {
        "ISO-2022-JP",
        "SHIFT_JIS",
        "EUC-JP"}},
    {"Korean", {
        "ISO-2022-KR",
        "EUC-KR",
        "UHC"}},
    {"Lithuanian", {
        "ISO-8859-4",
        "ISO-8859-10",
        "ISO-8859-13"}},
    {"Latvian", {
        "ISO-8859-4",
        "ISO-8859-10",
        "ISO-8859-13"}},
    {"Maltese", {
        "ISO-8859-3"}},
    {"Polish", {
        "ISO-8859-2",
        "ISO-8859-13",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Portuguese", {
        "ISO-8859-1",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Romanian", {
        "ISO-8859-2",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852"}},
    {"Russian", {
        "ISO-8859-5",
        "KOI8-R",
        "WINDOWS-1251",
        "MAC-CYRILLIC",
        "IBM866",
        "IBM855"}},
    {"Slovak", {
        "Windows-1250",
        "ISO-8859-2",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Slovene", {
        "ISO-8859-2",
        "ISO-8859-16",
        "Windows-1250",
        "IBM852",
        "MAC-CENTRALEUROPE"}},
    {"Spanish", {
        "ISO-8859-1",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Swedish", {
        "ISO-8859-1",
        "ISO-8859-4",
        "ISO-8859-9",
        "ISO-8859-15",
        "WINDOWS-1252"}},
    {"Thai", {
        "TIS-620",
        "ISO-8859-11"}},
    {"Turkish", {
        "ISO-8859-3",
        "ISO-8859-9"}},
    {"Vietnamese", {
        "VISCII",
        "Windows-1258"}}
};
/**
 * Charset name -> MIB enum number.
 *
 * Pre-seeded with names for which a MIB number is supplied by hand; main()
 * adds further entries at run time for every name QTextCodec resolves, and
 * warns when a resolved codec reports a different MIB than the one stored
 * here. Declared static (file-local) like the other table in this file.
 *
 * NOTE(review): the platform headings below presumably name the platform on
 * which QTextCodec::codecForName() failed to resolve the listed names -
 * confirm with the author.
 *
 * References:
 *
 * https://www.iana.org/assignments/character-sets/character-sets.xml
 * https://doc.qt.io/qt-5/qtextcodec.html
 */
static QMap<QByteArray,int> umibs{
    /* Linux */
    {"UHC", 38},            // 38 is the IANA MIB of EUC-KR
    {"ISO-8859-11", 2259},  // 2259 is the IANA MIB of TIS-620
    {"VISCII", 2082},
    /* Windows */
    {"ISO-2022-CN", 104},
    {"HZ-GB-2312", 2085},
    {"IBM852", 2010},
    {"ASCII", 3},           // registered by IANA as US-ASCII
    {"ISO-2022-KR", 37},
    {"IBM855", 2046}
    /* macOS */
};
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
auto keys = ucodecs.keys();
foreach(const auto& k, keys) {
foreach(const auto& c, ucodecs[k]) {
auto codec = QTextCodec::codecForName(c);
if (codec == nullptr) {
if (!umibs.contains(c)) {
qWarning() << k << "\t" << c << ":CHECK!!!";
}
} else {
if (!umibs.contains(c)) {
umibs.insert(c, codec->mibEnum());
} else if (codec->mibEnum() != umibs[c]) {
qWarning() << "\t" << c << ":" << codec->mibEnum() << "?" << umibs[c];
}
}
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment