Last active
August 28, 2023 13:31
-
-
Save VeryCrazyDog/c20b2cb83896e9975d22 to your computer and use it in GitHub Desktop.
UTF File to UTF-8 in std::string in C++ on Windows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Reading ASCII, UTF-8, UTF-16LE, UTF-16BE with auto BOM detection using C++11 on Windows platform | |
// Code tested on Microsoft Visual Studio 2013 on Windows 7 | |
// Part of the code is referencing http://cfc.kizzx2.com/index.php/reading-a-unicode-utf16-file-in-windows-c/ | |
#include <stdio.h> | |
#include <tchar.h> | |
#include <string> | |
#include <fstream> | |
#include <sstream> | |
#include <locale> | |
#include <codecvt> | |
#include <iostream> | |
#include <io.h> | |
#include <fcntl.h> | |
// Path of the text file that the demo entry point passes to readFile().
#define TEXT_FILE_PATH "D:\\test.txt" | |
// Integer tags naming the text encodings this sample distinguishes by BOM.
// ENCODING_ASCII is also the tag used when no BOM is present.
#define ENCODING_ASCII 0 | |
#define ENCODING_UTF8 1 | |
#define ENCODING_UTF16LE 2 | |
#define ENCODING_UTF16BE 3 | |
// Reads the whole file at `path` and returns its text as UTF-8 bytes.
//
// The encoding is chosen from a leading byte-order mark (BOM):
//   FF FE     -> UTF-16 little endian, converted to UTF-8
//   FE FF     -> UTF-16 big endian, converted to UTF-8
//   EF BB BF  -> UTF-8, returned with the BOM stripped
//   otherwise -> bytes are returned unchanged (ASCII / BOM-less UTF-8)
//
// Returns an empty string when the file cannot be opened or is empty.
// The result never contains an artificial trailing '\0'.
// A dangling odd byte at the end of a UTF-16 file is ignored.
// Throws std::range_error (from std::wstring_convert) when the UTF-16
// payload is malformed, e.g. contains an unpaired surrogate.
std::string readFile(std::string path)
{
    enum class Encoding { Plain, Utf8, Utf16LE, Utf16BE };

    std::ifstream ifs(path.c_str(), std::ios::binary);
    if (!ifs.is_open()) {
        // Unable to read file
        return std::string();
    }

    // Slurp the raw bytes once and inspect the BOM on the buffer. Probing the
    // stream with get() and seeking back fails for files shorter than the
    // probe, because hitting EOF sets failbit and seekg() then does nothing.
    std::ostringstream buffer;
    buffer << ifs.rdbuf();
    const std::string bytes = buffer.str();

    const auto byteAt = [&bytes](std::size_t index) -> unsigned int {
        return static_cast<unsigned char>(bytes[index]);
    };

    Encoding encoding = Encoding::Plain;
    std::size_t bomSize = 0;
    if (bytes.size() >= 2 && byteAt(0) == 0xff && byteAt(1) == 0xfe) {
        // The file contains UTF-16LE BOM
        encoding = Encoding::Utf16LE;
        bomSize = 2;
    }
    else if (bytes.size() >= 2 && byteAt(0) == 0xfe && byteAt(1) == 0xff) {
        // The file contains UTF-16BE BOM
        encoding = Encoding::Utf16BE;
        bomSize = 2;
    }
    else if (bytes.size() >= 3 && byteAt(0) == 0xef && byteAt(1) == 0xbb && byteAt(2) == 0xbf) {
        // The file contains UTF-8 BOM
        encoding = Encoding::Utf8;
        bomSize = 3;
    }

    if (encoding == Encoding::Plain) {
        // The file does not have BOM
        return bytes;
    }
    if (encoding == Encoding::Utf8) {
        return bytes.substr(bomSize);
    }

    // UTF-16: assemble each code unit from its two bytes explicitly. This
    // avoids casting the char buffer to wchar_t* (alignment/aliasing hazard),
    // does not depend on NUL termination, and needs no platform byte-swap call.
    const bool littleEndian = (encoding == Encoding::Utf16LE);
    std::wstring units;
    units.reserve((bytes.size() - bomSize) / 2);
    for (std::size_t i = bomSize; i + 1 < bytes.size(); i += 2) {
        const unsigned int low = byteAt(littleEndian ? i : i + 1);
        const unsigned int high = byteAt(littleEndian ? i + 1 : i);
        units.push_back(static_cast<wchar_t>((high << 8) | low));
    }

    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utfconv;
    return utfconv.to_bytes(units);
}
int _tmain(int argc, _TCHAR* argv[]) | |
{ | |
std::string path = TEXT_FILE_PATH; | |
std::string utf8Content = readFile(path); | |
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utfconv; | |
std::wstring utf16LeContent = utfconv.from_bytes(utf8Content); | |
_setmode(_fileno(stdout), _O_U8TEXT); | |
std::wcout << utf16LeContent << std::endl; | |
return 0; | |
} |
It's true that ifstream will not be able to handle a path containing Unicode characters. This code is mainly meant to demonstrate reading the file content. Thanks for your note.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What if the file is located on a path with Unicode chars? You could encode that as utf-8 for the input parameter, but I doubt ifstream can handle that.