Skip to content

Instantly share code, notes, and snippets.

@Bak-Jin-Hyeong
Last active May 13, 2017 11:48
Show Gist options
  • Save Bak-Jin-Hyeong/89b3375fb757171861dea5a836e2132f to your computer and use it in GitHub Desktop.
Save Bak-Jin-Hyeong/89b3375fb757171861dea5a836e2132f to your computer and use it in GitHub Desktop.
#define NOMINMAX
#include <fcntl.h>
#include <io.h>
#include <algorithm>
#include <climits>
#include <clocale>
#include <cstdio>
#include <new>
#include <string>
#include <utility>
#include <vector>
#include <Windows.h>
#include <uchardet-0.0.6/src/nscore.h>
#include <uchardet-0.0.6/src/uchardet.h>
// Scope guard that closes a Win32 file handle when it goes out of scope.
//
// Two "no handle" sentinels are in use in this file: CreateFileW reports
// failure with INVALID_HANDLE_VALUE, while open_file_for_read reports failure
// with nullptr. Both are treated as "nothing to close" so that CloseHandle is
// never invoked on a value that is not a real handle.
struct AutoCloseFileHandle {
    // Takes over responsibility for closing `h`; `h` may be a sentinel value.
    explicit AutoCloseFileHandle(HANDLE h) : handle(h) {}

    ~AutoCloseFileHandle() {
        if (handle != nullptr && handle != INVALID_HANDLE_VALUE) {
            ::CloseHandle(handle);
        }
    }

private:
    HANDLE handle = INVALID_HANDLE_VALUE;

    // Non-copyable: a copy would close the same handle twice.
    AutoCloseFileHandle(const AutoCloseFileHandle&) = delete;
    void operator=(const AutoCloseFileHandle&) = delete;
};
auto open_file_for_read(const wchar_t filename[]) {
struct result {
HANDLE handle;
long long size;
int error;
};
auto h = ::CreateFileW(filename,
GENERIC_READ,
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
nullptr, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, nullptr);
if (h == INVALID_HANDLE_VALUE) {
auto win32_error = ::GetLastError();
return result{ nullptr, 0, HRESULT_FROM_WIN32(win32_error) };
}
LARGE_INTEGER size = {};
if (!::GetFileSizeEx(h, &size)) {
auto win32_error = ::GetLastError();
::CloseHandle(h);
return result{
nullptr, size.QuadPart, HRESULT_FROM_WIN32(win32_error) };
}
return result{ h, size.QuadPart, 0 };
}
auto read_file_contents_into_vector(const wchar_t filename[]) {
struct result {
std::vector<char> contents;
int error;
};
auto f = open_file_for_read(filename);
const AutoCloseFileHandle defer_close_{ f.handle };
if (!f.handle) {
return result{ {}, f.error };
}
if (f.size >= INT_MAX) {
return result{ {}, E_OUTOFMEMORY };
}
if (f.size == 0) {
return result{};
}
static const auto int_max = static_cast<decltype(f.size)>(INT_MAX);
result r{ {}, 0 };
try {
DWORD cb_to_read = static_cast<DWORD>(f.size);
r.contents.resize(cb_to_read);
DWORD cb_read = 0;
if (!::ReadFile(
f.handle, &r.contents[0], cb_to_read, &cb_read, nullptr)) {
const auto win32_error = ::GetLastError();
return result{ {}, HRESULT_FROM_WIN32(win32_error) };
}
if (cb_read >= 0) {
r.contents.resize(cb_read);
}
} catch (const std::bad_alloc&) {
return result{ {}, E_OUTOFMEMORY };
}
return r;
}
// Maps a charset name (as reported by the encoding detector) to a Windows
// code page identifier.
//
// charset - NUL-terminated charset name; matched without regard to ASCII
//           letter case. May be nullptr.
// Returns the code page number, or -1 when the name is null or unknown.
int windows_codepage_from_charset(const char* charset) {
    struct Entry {
        const char* charset;
        int codePage;
    };
    static const Entry table[] = {
        { "ASCII", 20127 },
        { "BIG5", 950 },
        { "EUC-JP", 932 }, // 20932 },
        { "EUC-KR", 949 }, // 51949 },
        { "EUC-TW", 950 },
        { "GB18030", 54936 },
        { "GB2312", 936 },
        { "HZ-GB-2312", 52936 },
        { "IBM855", 855 },
        { "IBM866", 866 },
        { "ISO-2022-CN", 936 },
        { "ISO-2022-JP", 50222 },
        { "ISO-2022-KR", 50225 },
        { "ISO-8859-1", 28591 },
        { "ISO-8859-11", 874 },
        { "ISO-8859-13", 28603 },
        { "ISO-8859-15", 28605 },
        { "ISO-8859-2", 28592 },
        { "ISO-8859-3", 28593 },
        { "ISO-8859-4", 28594 },
        { "ISO-8859-5", 28595 },
        { "ISO-8859-6", 28596 },
        { "ISO-8859-7", 28597 },
        { "ISO-8859-8", 28598 },
        { "ISO-8859-8-I", 38598 },
        { "ISO-8859-9", 28599 },
        { "KOI8-R", 20866 },
        { "MAC-CYRILLIC", 10007 },
        { "SHIFT_JIS", 932 },
        { "SJIS", 932 },
        { "TIS-620", 874 },
        { "UTF-8", 65001 },
        { "VISCII", 1258 },
        { "WINDOWS-1250", 1250 },
        { "WINDOWS-1251", 1251 },
        { "WINDOWS-1252", 1252 },
        { "WINDOWS-1253", 1253 },
        // This slot previously repeated WINDOWS-1250, leaving 1254 unmapped.
        { "WINDOWS-1254", 1254 },
        { "WINDOWS-1255", 1255 },
        { "WINDOWS-1258", 1258 },
    };

    if (!charset) {
        return -1;
    }

    // Charset names are plain ASCII, so fold only A-Z. This keeps the lookup
    // independent of the process locale (e.g. Turkish casing rules for 'I').
    const auto equals_ignoring_ascii_case = [](const char* lhs, const char* rhs) {
        const auto lower = [](unsigned char c) -> unsigned char {
            return (c >= 'A' && c <= 'Z')
                ? static_cast<unsigned char>(c - 'A' + 'a')
                : c;
        };
        for (; *lhs != '\0' && *rhs != '\0'; ++lhs, ++rhs) {
            if (lower(static_cast<unsigned char>(*lhs)) !=
                lower(static_cast<unsigned char>(*rhs))) {
                return false;
            }
        }
        // Equal only when both strings ended at the same position.
        return *lhs == *rhs;
    };

    for (const auto& entry : table) {
        if (equals_ignoring_ascii_case(entry.charset, charset)) {
            return entry.codePage;
        }
    }
    return -1;
}
auto detect_encoding(const std::vector<char>& source) {
struct result {
int codePage;
bool has_bom;
int error;
};
if (source.empty()) {
return result{};
}
const auto n = source.size();
if (n >= 2) {
const auto c0 = source[0];
const auto c1 = source[1];
if (n >= 3 && c0 == '\xEF' && c1 == '\xBB') {
const auto c2 = source[2];
if (c2 == '\xBF') {
return result{ 65001, true, 0 };
}
}
else if (c0 == '\xFF' && c1 == '\xFE') {
if (n >= 4) {
const auto c2 = source[2];
const auto c3 = source[3];
if (c2 == '\x00' && c3 == '\x00') {
return result{ 12000, true, 0 };
}
}
return result{ 1200, true, 0 };
}
else if (c0 == '\xFE' && c1 == '\xFF') {
return result{ 1201, true, 0 };
}
else if (n >= 4 && c0 == '\x00' && c1 == '\x00') {
const auto c2 = source[2];
const auto c3 = source[3];
if (c2 == '\xFE' && c3 == '\xFF') {
return result{ 12001, true, 0 };
}
}
}
struct scoped_detector {
scoped_detector() = default;
~scoped_detector() {
if (handle) {
uchardet_delete(handle);
}
}
const uchardet_t handle = uchardet_new();
private:
scoped_detector(const scoped_detector&) = delete;
void operator=(const scoped_detector&) = delete;
} const detector;
if (!detector.handle) {
return result{ 0, false, -1 };
}
auto e = uchardet_handle_data(detector.handle, &source[0], source.size());
if (e != 0) {
if (e == NS_ERROR_OUT_OF_MEMORY) {
return result{ 0, false, E_OUTOFMEMORY };
}
else {
return result{ 0, false, e };
}
}
uchardet_data_end(detector.handle);
auto charset = uchardet_get_charset(detector.handle);
auto windows_codepage = windows_codepage_from_charset(charset);
return result{ windows_codepage, false, 0 };
}
auto convert_to_utf16(const std::vector<char>& source, int codePage) {
struct result{
std::vector<wchar_t> contents;
int error;
};
if (source.empty()) {
return result{};
}
auto n = source.size();
if (n >= INT_MAX) {
return result{ {}, E_OUTOFMEMORY };
}
const auto cb_source = static_cast<int>(n);
::SetLastError(0);
const auto cch_wide = ::MultiByteToWideChar(
codePage, MB_ERR_INVALID_CHARS, &source[0], cb_source, nullptr, 0);
const auto calc_size_error = ::GetLastError();
if (calc_size_error != 0) {
result{ {}, HRESULT_FROM_WIN32(calc_size_error) };
}
if (cch_wide <= 0) {
result{ {}, E_FAIL };
}
result r{ {}, 0 };
try {
r.contents.resize(cch_wide);
const auto cch_result = ::MultiByteToWideChar(
codePage, MB_ERR_INVALID_CHARS,
&source[0], cb_source, &r.contents[0], cch_wide);
if (cch_result != cch_wide) {
const auto conversion_error = ::GetLastError();
r.error = HRESULT_FROM_WIN32(conversion_error);
}
}
catch (const std::bad_alloc&) {
return result{ {}, E_OUTOFMEMORY };
}
return r;
}
auto convert_to_utf8(const std::vector<wchar_t>& source) {
struct result {
std::vector<char> contents;
int error;
};
if (source.empty()) {
return result{};
}
auto n = source.size();
if (n >= INT_MAX) {
return result{ {}, E_OUTOFMEMORY };
}
const auto cch_source = static_cast<int>(n);
const auto cch_utf8 = ::WideCharToMultiByte(
CP_UTF8, 0, &source[0], cch_source, nullptr, 0, nullptr, nullptr);
if (cch_utf8 <= 0) {
const auto win32_error = ::GetLastError();
result{ {}, HRESULT_FROM_WIN32(win32_error) };
}
result r{ {}, 0 };
try {
r.contents.resize(cch_utf8);
const auto cch_result = ::WideCharToMultiByte(
CP_UTF8, 0, &source[0], cch_source,
&r.contents[0], cch_utf8, nullptr, nullptr);
if (cch_result != cch_utf8) {
const auto conversion_error = ::GetLastError();
r.error = HRESULT_FROM_WIN32(conversion_error);
}
}
catch (const std::bad_alloc&) {
return result{ {}, E_OUTOFMEMORY };
}
return r;
}
int save_utf8_with_bom(
const wchar_t target_filename[], const std::vector<char>& contents) {
auto last_error = []() -> int {
const auto win32_error = ::GetLastError();
if (win32_error == 0) {
return E_FAIL;
}
return HRESULT_FROM_WIN32(win32_error);
};
auto h = ::CreateFileW(target_filename,
GENERIC_WRITE,
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
nullptr, CREATE_ALWAYS, 0, nullptr);
if (h == INVALID_HANDLE_VALUE) {
return last_error();
}
const AutoCloseFileHandle defer_close_{ h };
DWORD written_bom = 0;
if (!::WriteFile(h, "\xEF\xBB\xBF", 3, &written_bom, nullptr)) {
return last_error();
}
if (written_bom != 3) {
return last_error();
}
auto remain = contents.size();
size_t offset = 0;
while (remain > 0) {
DWORD to_write = static_cast<DWORD>(
std::min(static_cast<size_t>(INT_MAX), remain));
DWORD written = 0;
if (!::WriteFile(h, &contents[offset], to_write, &written, nullptr)) {
return last_error();
}
if (remain >= written) {
offset += written;
remain -= written;
}
else {
return last_error();
}
if (written != to_write) {
return last_error();
}
}
return 0;
}
// Emits a one-line JSON report describing the outcome of a conversion.
//
// The report goes to stderr (and includes an "error" field) when `error` is
// non-zero, and to stdout otherwise. File names are escaped so that the line
// stays valid JSON even when a path contains backslashes or quotes.
//
// converted        - whether the target file was written
// error            - 0 for success, otherwise the error code to report
// source_filename  - path of the input file
// source_filesize  - number of bytes read from the input
// source_codepage  - code page detected for the input
// source_has_bom   - whether the input started with a byte-order mark
// target_filename  - path of the output file
// target_filesize  - number of content bytes produced for the output
//
// Returns `error` unchanged so callers can `return print_result(...)`.
int print_result(
    bool converted, int error,
    const wchar_t source_filename[],
    size_t source_filesize,
    int source_codepage,
    bool source_has_bom,
    const wchar_t target_filename[],
    size_t target_filesize) {
    // Escapes a string for use inside a JSON string literal. A null pointer is
    // rendered as an empty string.
    const auto json_escape = [](const wchar_t* text) {
        std::wstring escaped;
        if (!text) {
            return escaped;
        }
        for (const wchar_t* p = text; *p != L'\0'; ++p) {
            const wchar_t ch = *p;
            switch (ch) {
            case L'"':  escaped += L"\\\""; break;
            case L'\\': escaped += L"\\\\"; break;
            case L'\b': escaped += L"\\b"; break;
            case L'\f': escaped += L"\\f"; break;
            case L'\n': escaped += L"\\n"; break;
            case L'\r': escaped += L"\\r"; break;
            case L'\t': escaped += L"\\t"; break;
            default:
                if (static_cast<unsigned long>(ch) < 0x20u) {
                    // Remaining control characters use the \u00XX form.
                    static const wchar_t hex_digits[] = L"0123456789ABCDEF";
                    escaped += L"\\u00";
                    escaped += hex_digits[(ch >> 4) & 0xF];
                    escaped += hex_digits[ch & 0xF];
                }
                else {
                    escaped += ch;
                }
                break;
            }
        }
        return escaped;
    };

    const std::wstring source_json = json_escape(source_filename);
    const std::wstring target_json = json_escape(target_filename);
    const wchar_t* const converted_text = converted ? L"true" : L"false";
    const wchar_t* const has_bom_text = source_has_bom ? L"true" : L"false";

    if (error) {
        fwprintf_s(stderr,
            L"{\"converted\": %s, \"error\": \"0x%08X\", "
            L"\"source\": \"%s\", \"source_size\": %zu, "
            L"\"source_codepage\": %d, \"source_has_bom\": %s, "
            L"\"target\": \"%s\", \"target_size\": %zu}\n",
            converted_text, error,
            source_json.c_str(), source_filesize,
            source_codepage, has_bom_text,
            target_json.c_str(), target_filesize);
    }
    else {
        fwprintf_s(stdout,
            L"{\"converted\": %s, "
            L"\"source\": \"%s\", \"source_size\": %zu, "
            L"\"source_codepage\": %d, \"source_has_bom\": %s, "
            L"\"target\": \"%s\", \"target_size\": %zu}\n",
            converted_text,
            source_json.c_str(), source_filesize,
            source_codepage, has_bom_text,
            target_json.c_str(), target_filesize);
    }
    return error;
}
// Reads `source_filename`, detects its encoding and, when needed, rewrites it
// to `target_filename` as UTF-8 with a byte-order mark.
//
// Nothing is written when the source cannot be read, is empty, already starts
// with a byte-order mark, or has an encoding that could not be determined.
// ASCII and BOM-less UTF-8 sources are rewritten (BOM prepended, bytes
// unchanged) only when `force_write_bom` is true.
//
// Every exit path reports through print_result; the return value is the error
// code it was given (0 when nothing went wrong).
int convert_encoding(
    const wchar_t source_filename[],
    const wchar_t target_filename[],
    bool force_write_bom) {
    auto source = read_file_contents_into_vector(source_filename);
    const auto source_size = source.contents.size();
    if (source.error != 0) {
        return print_result(false, source.error,
            source_filename, source_size, 0, false, target_filename, 0);
    }
    if (source.contents.empty()) {
        return print_result(false, 0,
            source_filename, source_size, 0, false, target_filename, 0);
    }

    auto detected = detect_encoding(source.contents);
    if (detected.has_bom || detected.error || detected.codePage <= 0) {
        return print_result(false, detected.error,
            source_filename, source_size, detected.codePage, detected.has_bom,
            target_filename, 0);
    }

    static const int CP_ASCII = 20127;
    // ASCII is a subset of UTF-8, so neither needs transcoding.
    const bool already_utf8 =
        detected.codePage == CP_ASCII || detected.codePage == CP_UTF8;
    if (already_utf8 && !force_write_bom) {
        return print_result(false, detected.error,
            source_filename, source_size, detected.codePage, detected.has_bom,
            target_filename, 0);
    }

    std::vector<char> utf8_contents;
    if (already_utf8) {
        utf8_contents = std::move(source.contents);
    }
    else {
        auto utf16 = convert_to_utf16(source.contents, detected.codePage);
        // Release each intermediate buffer as soon as it is no longer needed
        // to keep peak memory down for large files.
        source.contents.clear();
        source.contents.shrink_to_fit();
        if (utf16.error != 0) {
            return print_result(false, utf16.error,
                source_filename, source_size, detected.codePage, detected.has_bom,
                target_filename, 0);
        }

        // Deliberately non-const: moving out of a const object would silently
        // fall back to copying the whole buffer.
        auto utf8 = convert_to_utf8(utf16.contents);
        utf16.contents.clear();
        utf16.contents.shrink_to_fit();
        utf8_contents = std::move(utf8.contents);
        if (utf8.error) {
            return print_result(false, utf8.error,
                source_filename, source_size, detected.codePage, detected.has_bom,
                target_filename, utf8_contents.size());
        }
    }

    const auto save_error = save_utf8_with_bom(target_filename, utf8_contents);
    return print_result(save_error == 0, save_error,
        source_filename, source_size, detected.codePage, detected.has_bom,
        target_filename, utf8_contents.size());
}
// Program entry point.
//
// Usage: <exe> [--force-bom] <source filename> [target filename]
//
// The first non-option argument is the source file and the second the target;
// further arguments are ignored. Without a target the source file is
// overwritten in place. Returns the result of convert_encoding, or -1 after
// printing a usage line when no source file was given.
int wmain(int argc, wchar_t* argv[]) {
    // setlocale categories are enumerators, not bit flags, so they cannot be
    // OR-ed together; each one needs its own call.
    setlocale(LC_COLLATE, "");
    setlocale(LC_CTYPE, "");

    bool force_bom = false;
    const wchar_t* source_filename = nullptr;
    const wchar_t* target_filename = nullptr;
    for (int i = 1; i < argc; ++i) {
        if (wcscmp(argv[i], L"--force-bom") == 0) {
            force_bom = true;
        }
        else if (!source_filename) {
            source_filename = argv[i];
        }
        else if (!target_filename) {
            target_filename = argv[i];
        }
    }
    if (!target_filename) {
        // Convert in place when no explicit target was given.
        target_filename = source_filename;
    }

    if (source_filename && target_filename) {
        return convert_encoding(source_filename, target_filename, force_bom);
    }

    // No source file: print a usage line naming this executable.
    wchar_t this_filename[_MAX_FNAME]{};
    if (argc > 0) {
        _wsplitpath_s(argv[0], nullptr, 0, nullptr, 0,
            this_filename, _countof(this_filename), nullptr, 0);
    }
    if (this_filename[0] == L'\0') {
        wcsncpy_s(this_filename, L"<ConvertTo-UTF8-BOM>", _TRUNCATE);
    }
    fwprintf_s(stdout,
        L"{\"usage\": \"%s [--force-bom] <source filename> [target filename]\"}\n",
        this_filename);
    return -1;
}
@Bak-Jin-Hyeong
Copy link
Author

Bak-Jin-Hyeong commented May 11, 2017

➜ Get-ChildItem .. -Recurse -Include *.h, *.c, *.hpp, *.cpp, *.hxx, *.cxx, *.inl | ForEach-Object { .\ConvertTo-UTF8-BOM.exe $_.FullName }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment