Skip to content

Instantly share code, notes, and snippets.

@rainbowpigeon
Forked from VeryCrazyDog/ReadUtfFile.cpp
Created November 12, 2022 13:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rainbowpigeon/844a8e6d708d77567948985a8327dd64 to your computer and use it in GitHub Desktop.
Save rainbowpigeon/844a8e6d708d77567948985a8327dd64 to your computer and use it in GitHub Desktop.
UTF File to UTF-8 in std::string in C++ on Windows
// Reading ASCII, UTF-8, UTF-16LE, UTF-16BE with auto BOM detection using C++11 on Windows platform
// Code tested on Microsoft Visual Studio 2013 on Windows 7
// Parts of this code are based on http://cfc.kizzx2.com/index.php/reading-a-unicode-utf16-file-in-windows-c/
#include <stdio.h>
#include <tchar.h>
#include <string>
#include <fstream>
#include <sstream>
#include <locale>
#include <codecvt>
#include <iostream>
#include <io.h>
#include <fcntl.h>
// Path of the text file that the sample program reads.
#define TEXT_FILE_PATH "D:\\test.txt"
// Encoding identifiers selected by readFile() from the byte order mark (BOM).
// ENCODING_ASCII also stands for "no BOM found": the bytes are passed through untouched.
#define ENCODING_ASCII 0
#define ENCODING_UTF8 1
#define ENCODING_UTF16LE 2
#define ENCODING_UTF16BE 3

namespace {

// Appends the UTF-8 encoding of the Unicode code point `codePoint` to `out`.
void appendUtf8(std::string& out, unsigned long codePoint)
{
    if (codePoint < 0x80) {
        out += static_cast<char>(codePoint);
    }
    else if (codePoint < 0x800) {
        out += static_cast<char>(0xC0 | (codePoint >> 6));
        out += static_cast<char>(0x80 | (codePoint & 0x3F));
    }
    else if (codePoint < 0x10000) {
        out += static_cast<char>(0xE0 | (codePoint >> 12));
        out += static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (codePoint & 0x3F));
    }
    else {
        out += static_cast<char>(0xF0 | (codePoint >> 18));
        out += static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (codePoint & 0x3F));
    }
}

// Returns the 16-bit code unit stored in bytes[pos] and bytes[pos + 1],
// combined in big-endian or little-endian order.
unsigned long utf16UnitAt(const std::string& bytes, std::string::size_type pos, bool bigEndian)
{
    const unsigned long first = static_cast<unsigned char>(bytes[pos]);
    const unsigned long second = static_cast<unsigned char>(bytes[pos + 1]);
    return bigEndian ? ((first << 8) | second) : ((second << 8) | first);
}

// Decodes the UTF-16 data in `bytes`, starting at byte `offset`, into UTF-8.
// The bytes are combined explicitly, so the result does not depend on the
// size or alignment of wchar_t and embedded U+0000 characters are preserved.
// Unpaired surrogates are replaced by U+FFFD; a trailing odd byte is ignored.
std::string utf16BytesToUtf8(const std::string& bytes, std::string::size_type offset, bool bigEndian)
{
    std::string utf8;
    utf8.reserve(bytes.size());
    std::string::size_type pos = offset;
    while (pos + 1 < bytes.size()) {
        unsigned long codePoint = utf16UnitAt(bytes, pos, bigEndian);
        pos += 2;
        if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate follows directly.
            unsigned long low = 0;
            if (pos + 1 < bytes.size()) {
                low = utf16UnitAt(bytes, pos, bigEndian);
            }
            if (low >= 0xDC00 && low <= 0xDFFF) {
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                pos += 2;
            }
            else {
                codePoint = 0xFFFD;
            }
        }
        else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codePoint = 0xFFFD;
        }
        appendUtf8(utf8, codePoint);
    }
    return utf8;
}

} // namespace

// Reads the file at `path` and returns its content as a UTF-8 encoded string.
//
// The encoding is taken from the BOM at the start of the file:
//   FF FE    -> UTF-16LE, converted to UTF-8
//   FE FF    -> UTF-16BE, converted to UTF-8
//   EF BB BF -> UTF-8, returned as is
//   no BOM   -> bytes returned unchanged (ASCII / BOM-less UTF-8)
// The BOM itself is never part of the result.
//
// Returns an empty string when the file cannot be opened or is empty.
std::string readFile(std::string path)
{
    std::ifstream ifs(path.c_str(), std::ios::binary);
    if (!ifs.is_open()) {
        // Unable to read file
        return std::string();
    }

    // Read every byte first and inspect the buffer afterwards. Probing the
    // stream with get() puts it into a failed state on files shorter than the
    // BOM, after which seeking back to the start silently does nothing.
    std::stringstream ss;
    ss << ifs.rdbuf();
    std::string raw = ss.str();

    const unsigned char b0 = raw.size() > 0 ? static_cast<unsigned char>(raw[0]) : 0;
    const unsigned char b1 = raw.size() > 1 ? static_cast<unsigned char>(raw[1]) : 0;
    const unsigned char b2 = raw.size() > 2 ? static_cast<unsigned char>(raw[2]) : 0;

    int encoding = ENCODING_ASCII;
    std::string::size_type bomLength = 0;
    if (raw.size() >= 2 && b0 == 0xff && b1 == 0xfe) {
        // The file contains UTF-16LE BOM
        encoding = ENCODING_UTF16LE;
        bomLength = 2;
    }
    else if (raw.size() >= 2 && b0 == 0xfe && b1 == 0xff) {
        // The file contains UTF-16BE BOM
        encoding = ENCODING_UTF16BE;
        bomLength = 2;
    }
    else if (raw.size() >= 3 && b0 == 0xef && b1 == 0xbb && b2 == 0xbf) {
        // The file contains UTF-8 BOM
        encoding = ENCODING_UTF8;
        bomLength = 3;
    }

    if (encoding == ENCODING_UTF16LE || encoding == ENCODING_UTF16BE) {
        return utf16BytesToUtf8(raw, bomLength, encoding == ENCODING_UTF16BE);
    }

    // UTF-8 or no BOM: the payload already is the result, minus the BOM.
    raw.erase(0, bomLength);
    return raw;
}
// Program entry point: loads TEXT_FILE_PATH through readFile(), widens the
// UTF-8 result to UTF-16 and writes it to standard output.
// argc and argv are not used.
int _tmain(int argc, _TCHAR* argv[])
{
    // readFile() hands the file content back as UTF-8 bytes.
    const std::string fileUtf8 = readFile(std::string(TEXT_FILE_PATH));

    // std::wcout takes wide characters, so convert UTF-8 -> UTF-16 first.
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> widener;
    const std::wstring wideText = widener.from_bytes(fileUtf8);

    // Switch stdout to _O_U8TEXT mode before the wide-character write below.
    _setmode(_fileno(stdout), _O_U8TEXT);
    std::wcout << wideText << std::endl;

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment