Last active
May 13, 2017 11:48
-
-
Save Bak-Jin-Hyeong/89b3375fb757171861dea5a836e2132f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define NOMINMAX
#include <fcntl.h>
#include <io.h>
#include <algorithm>
#include <climits>
#include <clocale>
#include <cstdio>
#include <new>
#include <string>
#include <vector>
#include <Windows.h>
#include <uchardet-0.0.6/src/nscore.h>
#include <uchardet-0.0.6/src/uchardet.h>
struct AutoCloseFileHandle { | |
explicit AutoCloseFileHandle(HANDLE h) : handle(h) {} | |
~AutoCloseFileHandle() { | |
if (handle != INVALID_HANDLE_VALUE) { | |
::CloseHandle(handle); | |
} | |
} | |
private: | |
HANDLE handle = INVALID_HANDLE_VALUE; | |
AutoCloseFileHandle(const AutoCloseFileHandle&) = delete; | |
void operator=(const AutoCloseFileHandle&) = delete; | |
}; | |
auto open_file_for_read(const wchar_t filename[]) { | |
struct result { | |
HANDLE handle; | |
long long size; | |
int error; | |
}; | |
auto h = ::CreateFileW(filename, | |
GENERIC_READ, | |
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, | |
nullptr, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, nullptr); | |
if (h == INVALID_HANDLE_VALUE) { | |
auto win32_error = ::GetLastError(); | |
return result{ nullptr, 0, HRESULT_FROM_WIN32(win32_error) }; | |
} | |
LARGE_INTEGER size = {}; | |
if (!::GetFileSizeEx(h, &size)) { | |
auto win32_error = ::GetLastError(); | |
::CloseHandle(h); | |
return result{ | |
nullptr, size.QuadPart, HRESULT_FROM_WIN32(win32_error) }; | |
} | |
return result{ h, size.QuadPart, 0 }; | |
} | |
auto read_file_contents_into_vector(const wchar_t filename[]) { | |
struct result { | |
std::vector<char> contents; | |
int error; | |
}; | |
auto f = open_file_for_read(filename); | |
const AutoCloseFileHandle defer_close_{ f.handle }; | |
if (!f.handle) { | |
return result{ {}, f.error }; | |
} | |
if (f.size >= INT_MAX) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
if (f.size == 0) { | |
return result{}; | |
} | |
static const auto int_max = static_cast<decltype(f.size)>(INT_MAX); | |
result r{ {}, 0 }; | |
try { | |
DWORD cb_to_read = static_cast<DWORD>(f.size); | |
r.contents.resize(cb_to_read); | |
DWORD cb_read = 0; | |
if (!::ReadFile( | |
f.handle, &r.contents[0], cb_to_read, &cb_read, nullptr)) { | |
const auto win32_error = ::GetLastError(); | |
return result{ {}, HRESULT_FROM_WIN32(win32_error) }; | |
} | |
if (cb_read >= 0) { | |
r.contents.resize(cb_read); | |
} | |
} catch (const std::bad_alloc&) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
return r; | |
} | |
// Maps a charset name to a Windows code page identifier.
//
// The comparison ignores ASCII letter case and does not depend on the
// current C locale (charset names are plain ASCII).
//
// Returns the code page number, or -1 when `charset` is null, empty or not
// present in the table.
int windows_codepage_from_charset(const char* charset) {
    struct entry {
        const char* charset;
        int code_page;
    };
    static constexpr entry table[] = {
        { "ASCII", 20127 },
        { "BIG5", 950 },
        { "EUC-JP", 932 },  // alternative left by the original author: 20932
        { "EUC-KR", 949 },  // alternative left by the original author: 51949
        { "EUC-TW", 950 },
        { "GB18030", 54936 },
        { "GB2312", 936 },
        { "HZ-GB-2312", 52936 },
        { "IBM855", 855 },
        { "IBM866", 866 },
        { "ISO-2022-CN", 936 },
        { "ISO-2022-JP", 50222 },
        { "ISO-2022-KR", 50225 },
        { "ISO-8859-1", 28591 },
        { "ISO-8859-11", 874 },
        { "ISO-8859-13", 28603 },
        { "ISO-8859-15", 28605 },
        { "ISO-8859-2", 28592 },
        { "ISO-8859-3", 28593 },
        { "ISO-8859-4", 28594 },
        { "ISO-8859-5", 28595 },
        { "ISO-8859-6", 28596 },
        { "ISO-8859-7", 28597 },
        { "ISO-8859-8", 28598 },
        { "ISO-8859-8-I", 38598 },
        { "ISO-8859-9", 28599 },
        { "KOI8-R", 20866 },
        { "MAC-CYRILLIC", 10007 },
        { "SHIFT_JIS", 932 },
        { "SJIS", 932 },
        { "TIS-620", 874 },
        { "UTF-8", 65001 },
        { "VISCII", 1258 },
        { "WINDOWS-1250", 1250 },
        { "WINDOWS-1251", 1251 },
        { "WINDOWS-1252", 1252 },
        { "WINDOWS-1253", 1253 },
        { "WINDOWS-1254", 1254 },  // was a duplicated WINDOWS-1250 row
        { "WINDOWS-1255", 1255 },
        { "WINDOWS-1256", 1256 },
        { "WINDOWS-1257", 1257 },
        { "WINDOWS-1258", 1258 },
    };

    if (charset == nullptr) {
        return -1;
    }

    // ASCII-only, locale-independent case-insensitive equality.
    const auto equals_ignoring_ascii_case =
        [](const char* lhs, const char* rhs) {
            const auto lower = [](char c) {
                return (c >= 'A' && c <= 'Z')
                    ? static_cast<char>(c - 'A' + 'a') : c;
            };
            for (; *lhs != '\0' && *rhs != '\0'; ++lhs, ++rhs) {
                if (lower(*lhs) != lower(*rhs)) {
                    return false;
                }
            }
            // Equal only if both strings ended at the same position.
            return *lhs == *rhs;
        };

    for (const auto& row : table) {
        if (equals_ignoring_ascii_case(row.charset, charset)) {
            return row.code_page;
        }
    }
    return -1;
}
auto detect_encoding(const std::vector<char>& source) { | |
struct result { | |
int codePage; | |
bool has_bom; | |
int error; | |
}; | |
if (source.empty()) { | |
return result{}; | |
} | |
const auto n = source.size(); | |
if (n >= 2) { | |
const auto c0 = source[0]; | |
const auto c1 = source[1]; | |
if (n >= 3 && c0 == '\xEF' && c1 == '\xBB') { | |
const auto c2 = source[2]; | |
if (c2 == '\xBF') { | |
return result{ 65001, true, 0 }; | |
} | |
} | |
else if (c0 == '\xFF' && c1 == '\xFE') { | |
if (n >= 4) { | |
const auto c2 = source[2]; | |
const auto c3 = source[3]; | |
if (c2 == '\x00' && c3 == '\x00') { | |
return result{ 12000, true, 0 }; | |
} | |
} | |
return result{ 1200, true, 0 }; | |
} | |
else if (c0 == '\xFE' && c1 == '\xFF') { | |
return result{ 1201, true, 0 }; | |
} | |
else if (n >= 4 && c0 == '\x00' && c1 == '\x00') { | |
const auto c2 = source[2]; | |
const auto c3 = source[3]; | |
if (c2 == '\xFE' && c3 == '\xFF') { | |
return result{ 12001, true, 0 }; | |
} | |
} | |
} | |
struct scoped_detector { | |
scoped_detector() = default; | |
~scoped_detector() { | |
if (handle) { | |
uchardet_delete(handle); | |
} | |
} | |
const uchardet_t handle = uchardet_new(); | |
private: | |
scoped_detector(const scoped_detector&) = delete; | |
void operator=(const scoped_detector&) = delete; | |
} const detector; | |
if (!detector.handle) { | |
return result{ 0, false, -1 }; | |
} | |
auto e = uchardet_handle_data(detector.handle, &source[0], source.size()); | |
if (e != 0) { | |
if (e == NS_ERROR_OUT_OF_MEMORY) { | |
return result{ 0, false, E_OUTOFMEMORY }; | |
} | |
else { | |
return result{ 0, false, e }; | |
} | |
} | |
uchardet_data_end(detector.handle); | |
auto charset = uchardet_get_charset(detector.handle); | |
auto windows_codepage = windows_codepage_from_charset(charset); | |
return result{ windows_codepage, false, 0 }; | |
} | |
auto convert_to_utf16(const std::vector<char>& source, int codePage) { | |
struct result{ | |
std::vector<wchar_t> contents; | |
int error; | |
}; | |
if (source.empty()) { | |
return result{}; | |
} | |
auto n = source.size(); | |
if (n >= INT_MAX) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
const auto cb_source = static_cast<int>(n); | |
::SetLastError(0); | |
const auto cch_wide = ::MultiByteToWideChar( | |
codePage, MB_ERR_INVALID_CHARS, &source[0], cb_source, nullptr, 0); | |
const auto calc_size_error = ::GetLastError(); | |
if (calc_size_error != 0) { | |
result{ {}, HRESULT_FROM_WIN32(calc_size_error) }; | |
} | |
if (cch_wide <= 0) { | |
result{ {}, E_FAIL }; | |
} | |
result r{ {}, 0 }; | |
try { | |
r.contents.resize(cch_wide); | |
const auto cch_result = ::MultiByteToWideChar( | |
codePage, MB_ERR_INVALID_CHARS, | |
&source[0], cb_source, &r.contents[0], cch_wide); | |
if (cch_result != cch_wide) { | |
const auto conversion_error = ::GetLastError(); | |
r.error = HRESULT_FROM_WIN32(conversion_error); | |
} | |
} | |
catch (const std::bad_alloc&) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
return r; | |
} | |
auto convert_to_utf8(const std::vector<wchar_t>& source) { | |
struct result { | |
std::vector<char> contents; | |
int error; | |
}; | |
if (source.empty()) { | |
return result{}; | |
} | |
auto n = source.size(); | |
if (n >= INT_MAX) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
const auto cch_source = static_cast<int>(n); | |
const auto cch_utf8 = ::WideCharToMultiByte( | |
CP_UTF8, 0, &source[0], cch_source, nullptr, 0, nullptr, nullptr); | |
if (cch_utf8 <= 0) { | |
const auto win32_error = ::GetLastError(); | |
result{ {}, HRESULT_FROM_WIN32(win32_error) }; | |
} | |
result r{ {}, 0 }; | |
try { | |
r.contents.resize(cch_utf8); | |
const auto cch_result = ::WideCharToMultiByte( | |
CP_UTF8, 0, &source[0], cch_source, | |
&r.contents[0], cch_utf8, nullptr, nullptr); | |
if (cch_result != cch_utf8) { | |
const auto conversion_error = ::GetLastError(); | |
r.error = HRESULT_FROM_WIN32(conversion_error); | |
} | |
} | |
catch (const std::bad_alloc&) { | |
return result{ {}, E_OUTOFMEMORY }; | |
} | |
return r; | |
} | |
int save_utf8_with_bom( | |
const wchar_t target_filename[], const std::vector<char>& contents) { | |
auto last_error = []() -> int { | |
const auto win32_error = ::GetLastError(); | |
if (win32_error == 0) { | |
return E_FAIL; | |
} | |
return HRESULT_FROM_WIN32(win32_error); | |
}; | |
auto h = ::CreateFileW(target_filename, | |
GENERIC_WRITE, | |
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, | |
nullptr, CREATE_ALWAYS, 0, nullptr); | |
if (h == INVALID_HANDLE_VALUE) { | |
return last_error(); | |
} | |
const AutoCloseFileHandle defer_close_{ h }; | |
DWORD written_bom = 0; | |
if (!::WriteFile(h, "\xEF\xBB\xBF", 3, &written_bom, nullptr)) { | |
return last_error(); | |
} | |
if (written_bom != 3) { | |
return last_error(); | |
} | |
auto remain = contents.size(); | |
size_t offset = 0; | |
while (remain > 0) { | |
DWORD to_write = static_cast<DWORD>( | |
std::min(static_cast<size_t>(INT_MAX), remain)); | |
DWORD written = 0; | |
if (!::WriteFile(h, &contents[offset], to_write, &written, nullptr)) { | |
return last_error(); | |
} | |
if (remain >= written) { | |
offset += written; | |
remain -= written; | |
} | |
else { | |
return last_error(); | |
} | |
if (written != to_write) { | |
return last_error(); | |
} | |
} | |
return 0; | |
} | |
int print_result( | |
bool converted, int error, | |
const wchar_t source_filename[], | |
size_t source_filesize, | |
int source_codepage, | |
bool source_has_bom, | |
const wchar_t target_filename[], | |
size_t target_filesize) { | |
if (error) { | |
fwprintf_s(stderr, | |
L"{\"converted\": %s, \"error\": \"0x%08X\", " | |
L"\"source\": \"%s\", \"source_size\": %zu, " | |
L"\"source_codepage\": %d, \"source_has_bom\": %s, " | |
L"\"target\": \"%s\", \"target_size\": %zu}\n", | |
converted ? L"true" : L"false", error, | |
source_filename, source_filesize, | |
source_codepage, source_has_bom ? L"true" : L"false", | |
target_filename, target_filesize); | |
} | |
else { | |
fwprintf_s(stdout, | |
L"{\"converted\": %s, " | |
L"\"source\": \"%s\", \"source_size\": %zu, " | |
L"\"source_codepage\": %d, \"source_has_bom\": %s, " | |
L"\"target\": \"%s\", \"target_size\": %zu}\n", | |
converted ? L"true" : L"false", | |
source_filename, source_filesize, | |
source_codepage, source_has_bom ? L"true" : L"false", | |
target_filename, target_filesize); | |
} | |
return error; | |
} | |
int convert_encoding( | |
const wchar_t source_filename[], | |
const wchar_t target_filename[], | |
bool force_write_bom) { | |
auto source = read_file_contents_into_vector(source_filename); | |
const auto source_size = source.contents.size(); | |
if (source.error != 0) { | |
return print_result(false, source.error, | |
source_filename, source_size, 0, false, target_filename, 0); | |
} | |
if (source.contents.empty()) { | |
return print_result(false, 0, | |
source_filename, source_size, 0, false, target_filename, 0); | |
} | |
auto detected = detect_encoding(source.contents); | |
if (detected.has_bom || detected.error || detected.codePage <= 0) { | |
return print_result(false, detected.error, | |
source_filename, source_size, detected.codePage, detected.has_bom, | |
target_filename, 0); | |
} | |
static const int CP_ASCII = 20127; | |
if (!force_write_bom && | |
(detected.codePage == CP_ASCII || detected.codePage == CP_UTF8)) { | |
return print_result(false, detected.error, | |
source_filename, source_size, detected.codePage, detected.has_bom, | |
target_filename, 0); | |
} | |
std::vector<char> utf8_contents; | |
if (detected.codePage != CP_ASCII && detected.codePage != CP_UTF8) { | |
auto utf16 = convert_to_utf16(source.contents, detected.codePage); | |
source.contents.clear(); | |
source.contents.shrink_to_fit(); | |
if (utf16.error != 0) { | |
return print_result(false, utf16.error, | |
source_filename, source_size, detected.codePage, detected.has_bom, | |
target_filename, 0); | |
} | |
const auto utf8 = convert_to_utf8(utf16.contents); | |
utf16.contents.clear(); | |
utf16.contents.shrink_to_fit(); | |
utf8_contents = std::move(utf8.contents); | |
if (utf8.error) { | |
return print_result(false, utf8.error, | |
source_filename, source_size, detected.codePage, detected.has_bom, | |
target_filename, utf8_contents.size()); | |
} | |
} | |
else { | |
utf8_contents = std::move(source.contents); | |
} | |
const auto save_error = save_utf8_with_bom(target_filename, utf8_contents); | |
return print_result(save_error == 0, save_error, | |
source_filename, source_size, detected.codePage, detected.has_bom, | |
target_filename, utf8_contents.size()); | |
} | |
int wmain(int argc, wchar_t* argv[]) { | |
setlocale(LC_COLLATE | LC_CTYPE, ""); | |
bool force_bom = false; | |
const wchar_t* source_filename = nullptr; | |
const wchar_t* target_filename = nullptr; | |
for (int i = 1; i < argc; ++i) { | |
if (wcscmp(argv[i], L"--force-bom") == 0) { | |
force_bom = true; | |
} | |
else if (!source_filename) { | |
source_filename = argv[i]; | |
} | |
else if (!target_filename) { | |
target_filename = argv[i]; | |
} | |
} | |
if (!target_filename) { | |
target_filename = source_filename; | |
} | |
if (source_filename && target_filename) { | |
return convert_encoding(source_filename, target_filename, force_bom); | |
} | |
else { | |
wchar_t this_filename[_MAX_FNAME]{}; | |
if (argc > 0) { | |
_wsplitpath_s(argv[0], nullptr, 0, nullptr, 0, | |
this_filename, _countof(this_filename), nullptr, 0); | |
} | |
if (this_filename[0] == '\0') { | |
wcsncpy_s(this_filename, L"<ConvertTo-UTF8-BOM>", _TRUNCATE); | |
} | |
fwprintf_s(stdout, | |
L"{\"usage\": \"%s [--force-bom] <source filename> [target filename]\"}\n", | |
this_filename); | |
return -1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
➜ Get-ChildItem .. -Recurse -Include *.h, *.c, *.hpp, *.cpp, *.hxx, *.cxx, *.inl | ForEach-Object { .\ConvertTo-UTF8-BOM.exe $_.FullName }