Skip to content

Instantly share code, notes, and snippets.

@API-Beast
Created June 8, 2016 11:13
Show Gist options
  • Save API-Beast/36bdc029cd7dc397510ea999d9a4a33f to your computer and use it in GitHub Desktop.
Save API-Beast/36bdc029cd7dc397510ea999d9a4a33f to your computer and use it in GitHub Desktop.
UTF8 C++
/*
* DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
* TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
*
* 0. You just DO WHAT THE FUCK YOU WANT TO.
*/
#include "UTF8.h"
#include "Unicode.h"
namespace
{
// Bits(a, i): the top `i` bits of the octet `a`, moved down to the low end,
// e.g. Bits(0xC3, 3) == 6 (binary 110).
// Both arguments are parenthesised so that compound expressions such as
// Bits(x | y, n) or Bits(x, a + b) expand with the intended precedence.
#define Bits(a, i) ((a) >> (8 - (i)))
// Leading-bit patterns used to classify the first octet of a UTF-8 sequence.
static const char B0 = 0;        // 0xxxxxxx : single octet
static const char B110 = 6;      // 110xxxxx : lead of a 2-octet sequence
static const char B1110 = 14;    // 1110xxxx : lead of a 3-octet sequence
static const char B11110 = 30;   // 11110xxx : lead of a 4-octet sequence
static const char B111110 = 62;  // 111110xx : lead of a 5-octet sequence
// Returns the value of `c` as an unsigned octet (0..255), regardless of
// whether plain char is signed on this platform.
const unsigned char toUC(const char c)
{
  return static_cast<unsigned char>(c);
}
// Returns how many octets the sequence introduced by lead octet `c` occupies,
// judged only from the leading bit pattern of `c`:
//   0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4, 111110xx -> 5.
// Any other octet (continuation octets 10xxxxxx and 0xFC..0xFF) yields 1.
int GetMultibyteLength(const char c)
{
  const unsigned char lead = toUC(c);
  if((lead & 0x80) == 0x00) return 1;
  if((lead & 0xE0) == 0xC0) return 2;
  if((lead & 0xF0) == 0xE0) return 3;
  if((lead & 0xF8) == 0xF0) return 4;
  if((lead & 0xFC) == 0xF8) return 5;
  // Not a recognised lead octet: treat it as a single octet.
  return 1;
}
// We want: 00000000 00000001 01101000 01000100
// First we need to ignore the bits used to declare the multibyte
// We have: 11110000 10010110 10100001 10000100
// Ignore: ^^^^^ ^^ ^^ ^^
// We can do that by 0'ing the first bits (Note: 0xff >> x = first x bits = 0, the rest = 1)
// Octet 0: 11110000 & (0xff >> 5) = xxxx x000
// Octet 1: 10010110 & (0xff >> 2) = xx01 0110
// Octet 2: 10100001 & (0xff >> 2) = xx10 0001
// Octet 3: 10000100 & (0xff >> 2) = xx00 0100
// Now we need to compose the 4 byte integer out of these, so we shift them into the right position and "or" them.
// Octet 0: 11110000 << 6*3 = 000__ ________ ________
// Octet 1: 10010110 << 6*2 = ___01 0110____ ________
// Octet 2: 10100001 << 6*1 = _____ ____1000 01______
// Octet 3: 10000100 << 0 = _____ ________ __000100
// Or them all:
// We get: 00000000 00000001 01101000 01000100
// We want: 00000000 00000001 01101000 01000100
// Yay.
// Assembles the code point of a multi-octet sequence.
//   c     - points at the lead octet of the sequence
//   bytes - total sequence length announced by the lead octet
// The lead octet contributes the bits left after masking off its (bytes+1)
// marker bits; every following octet contributes its low 6 bits, most
// significant group first.
// Each following octet must have the continuation form 10xxxxxx. If one does
// not, U+FFFD is returned and no further octet is read. A NUL terminator is
// never a continuation octet, so a truncated sequence at the end of a
// NUL-terminated buffer stops at the terminator instead of reading past it.
Codepoint ComposeFromMultibyte(const char* c, int bytes)
{
  const unsigned char* octet = reinterpret_cast<const unsigned char*>(c);
  Codepoint result = ((*octet) & (0xff >> (bytes + 1))) << (bytes - 1) * 6;
  for(int i = 1; i < bytes; ++i)
  {
    ++octet;
    if(((*octet) & 0xC0) != 0x80)
      return 0xFFFD; // Unicode Replacement Character: malformed or truncated sequence
    result |= ((*octet) & (0xff >> 2)) << (bytes - 1 - i) * 6;
  }
  return result;
}
// Decodes the character whose first octet is at `c`.
//   c     - points at the first octet to decode
//   bytes - out: the length reported by GetMultibyteLength for that octet
// Returns the octet itself for ASCII, the composed code point for a
// multi-octet sequence, and U+FFFD (Unicode Replacement Character) for any
// octet that cannot start a character: a stray continuation octet
// (0x80..0xBF) or an octet >= 0xC0 for which no multi-octet length is known
// (0xFC..0xFF). Without the second check those octets would be composed as a
// one-octet "sequence" and come out as an unrelated ASCII-range value.
Codepoint GetUtf8Codepoint(const char* c, int& bytes)
{
  const unsigned char lead = *reinterpret_cast<const unsigned char*>(c);
  bytes = GetMultibyteLength(*c);
  if(lead < 0x80) // ASCII / Length 1 Octet
    return lead;
  if(lead < 0xC0 || bytes < 2) // Not a valid lead octet: output the "bad" character marker
    return 0xFFFD;
  return ComposeFromMultibyte(c, bytes);
}
}
// Steps *position over the character it currently points at and returns the
// code point that starts at the new position (advance first, then decode).
//   str      - UTF-8 text
//   position - in/out: byte offset into str
// Returns 0 when *position is outside the string on entry, or when the step
// reaches the end of the string. If the skipped lead octet announces more
// octets than remain, *position is clamped to str.size() so it never indexes
// beyond the string.
Codepoint UTF8::DecodeNext(const std::string& str, int* position)
{
  const int size = static_cast<int>(str.size());
  if((*position) < 0 || (*position) >= size)
    return 0;
  (*position) += GetMultibyteLength(str[*position]);
  if((*position) >= size)
  {
    (*position) = size;
    return 0;
  }
  int bytes;
  return GetUtf8Codepoint(&str[*position], bytes);
}
// Returns the code point of the character starting at byte offset `position`
// in `str`, or 0 when `position` does not lie inside the string.
Codepoint UTF8::DecodeAt(const std::string& str, int position)
{
  const bool insideString =
    position >= 0 && static_cast<std::string::size_type>(position) < str.size();
  if(!insideString)
    return 0;
  int length = 0; // filled in by GetUtf8Codepoint; not needed by the caller
  return GetUtf8Codepoint(str.data() + position, length);
}
// Steps *position back to the first octet of the preceding character and
// returns that character's code point.
//   str      - UTF-8 text
//   position - in/out: byte offset into str; values beyond the end are
//              treated as str.size()
// Returns 0 (leaving *position untouched) when *position <= 0, and 0 for an
// empty string.
Codepoint UTF8::DecodeReverse(const std::string& str, int* position)
{
  const int size = static_cast<int>(str.size());
  if((*position) > size)
    (*position) = size;
  if((*position) <= 0)
    return 0;
  (*position)--;
  // Walk back over continuation octets (10xxxxxx) until a lead octet is found.
  // The bound is on the offset itself, so a string that begins with stray
  // continuation octets stops at index 0 instead of running off the front.
  while((*position) > 0)
  {
    if(toUC(str[*position]) >= 0x80 && toUC(str[*position]) < 0xC0)
      (*position)--;
    else
      break;
  }
  int bytes;
  return GetUtf8Codepoint(&str[*position], bytes);
}
// Moves *position forward by `characters` characters.
//   str        - UTF-8 text
//   position   - in/out: byte offset into str
//   characters - number of characters to skip; a negative count skips
//                backward instead
// The cursor never leaves [0, str.size()]: skipping stops at the end of the
// string, and a final lead octet that announces more octets than remain
// leaves the cursor at str.size().
void UTF8::SkipForward(const std::string& str, int* position, int characters)
{
  if(characters < 0) return SkipBackward(str, position, -characters);
  const int size = static_cast<int>(str.size());
  if((*position) < 0)
    (*position) = 0;
  while(characters-- > 0 && (*position) < size)
    (*position) += GetMultibyteLength(str[*position]);
  if((*position) > size)
    (*position) = size;
}
// Moves *position backward by `characters` characters, landing on the first
// octet of a character.
//   str        - UTF-8 text
//   position   - in/out: byte offset into str; values beyond the end are
//                treated as str.size()
//   characters - number of characters to skip; a negative count skips
//                forward instead
// Skipping stops at offset 0, so asking for more characters than precede the
// cursor never indexes in front of the string.
void UTF8::SkipBackward(const std::string& str, int* position, int characters)
{
  if(characters < 0) return SkipForward(str, position, -characters);
  const int size = static_cast<int>(str.size());
  if((*position) > size)
    (*position) = size;
  while(characters-- > 0 && (*position) > 0)
  {
    (*position)--;
    // Step over continuation octets (10xxxxxx) to reach the lead octet.
    while((*position) > 0 && toUC(str[*position]) >= 0x80 && toUC(str[*position]) < 0xC0)
      (*position)--;
  }
}
// Applies StripLeft and then StripRight with the same code point `c` and
// returns the result.
std::string UTF8::Strip(const std::string& str, Codepoint c)
{
  return UTF8::StripRight(UTF8::StripLeft(str, c), c);
}
std::string UTF8::StripLeft(const std::string& str, Codepoint c)
{
int i = 0;
Codepoint cur = UTF8::DecodeAt(str, i);
while(c == cur)
{
if(i >= str.size())
break;
cur = UTF8::DecodeNext(str, &i);
}
return str.substr(i, std::string::npos);
}
std::string UTF8::StripRight(const std::string& str, Codepoint c)
{
int i = str.size()-1;
Codepoint cur = UTF8::DecodeAt(str, i);
while(c == cur)
{
if(i <= 0)
break;
cur = UTF8::DecodeReverse(str, &i);
}
return str.substr(0, i+1);
}
// Returns `str` with `fromStart` characters removed from the front and
// `fromEnd` characters removed from the back.
//   str       - UTF-8 text
//   fromStart - characters to drop at the front; values <= 0 drop nothing
//   fromEnd   - characters to drop at the back; values <= 0 drop nothing
// When the two chopped ranges meet or overlap the result is an empty string.
// Without that check the negative length would wrap around to a huge count
// and the tail of the string would be returned instead.
std::string UTF8::Chop(const std::string& str, int fromStart, int fromEnd)
{
  int start = 0;
  int end = static_cast<int>(str.size());
  if(fromStart > 0)
    UTF8::SkipForward(str, &start, fromStart);
  if(fromEnd > 0)
    UTF8::SkipBackward(str, &end, fromEnd);
  if(start >= end)
    return std::string();
  return str.substr(start, end - start);
}
// Returns the UTF-8 octet sequence (1 to 4 octets) for code point `c`.
// Returns an empty string when `c` cannot be represented in UTF-8: values
// above U+10FFFF and the surrogate range U+D800..U+DFFF.
std::string UTF8::Encode(Codepoint c)
{
  std::string text;
  text.reserve(4);
  const bool surrogate = (c >= 0xD800 && c <= 0xDFFF);
  if (surrogate || c > 0x10FFFF)
  {
    // Invalid char; don't encode anything.
  }
  else if (c <= 0x7F)
  {
    // Plain single-byte ASCII.
    text.push_back(static_cast<char>(c));
  }
  else if (c <= 0x7FF)
  {
    // Two bytes.
    text.push_back(static_cast<char>(0xC0 | (c >> 6)));
    text.push_back(static_cast<char>(0x80 | ((c >> 0) & 0x3F)));
  }
  else if (c <= 0xFFFF)
  {
    // Three bytes.
    text.push_back(static_cast<char>(0xE0 | (c >> 12)));
    text.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
    text.push_back(static_cast<char>(0x80 | ((c >> 0) & 0x3F)));
  }
  else
  {
    // Four bytes (U+10000..U+10FFFF).
    text.push_back(static_cast<char>(0xF0 | (c >> 18)));
    text.push_back(static_cast<char>(0x80 | ((c >> 12) & 0x3F)));
    text.push_back(static_cast<char>(0x80 | ((c >> 6) & 0x3F)));
    text.push_back(static_cast<char>(0x80 | ((c >> 0) & 0x3F)));
  }
  return text;
}
bool UTF8::Contains(const std::string& str, Codepoint c)
{
int i = 0;
Codepoint cur = UTF8::DecodeAt(str, i);
while(cur = UTF8::DecodeNext(str, &i))
if(cur == c) return true;
return false;
}
/*
* DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
* TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
*
* 0. You just DO WHAT THE FUCK YOU WANT TO.
*/
#pragma once
#include <string>
#include <cinttypes>
// Raw octet type.
using Byte = unsigned char;
// A Unicode code point, stored as char32_t.
using Codepoint = char32_t;

// Helpers for working with UTF-8 text held in a std::string.
// Every `position` is a byte offset into the string, not a character index.
namespace UTF8
{
  // --- Decoding ---
  // Steps *position over the current character, then returns the code point
  // found at the new position; returns 0 if *position is not inside str.
  Codepoint DecodeNext(const std::string& str, int* position);
  // Steps *position back to the start of the preceding character and returns
  // its code point; returns 0 if *position <= 0.
  Codepoint DecodeReverse(const std::string& str, int* position);
  // Returns the code point starting at `position`; 0 if position is out of range.
  Codepoint DecodeAt(const std::string& str, int position);

  // --- Encoding ---
  // Returns the UTF-8 octets for `c`, or an empty string if `c` is rejected.
  std::string Encode(Codepoint c);

  // --- Cursor movement ---
  // Move *position by `characters` characters; a negative count moves the
  // other way.
  void SkipForward(const std::string& str, int* position, int characters);
  void SkipBackward(const std::string& str, int* position, int characters);

  // --- Trimming ---
  // Strip occurrences of `c` from both ends / the front / the back of str.
  std::string Strip(const std::string& str, Codepoint c);
  std::string StripLeft(const std::string& str, Codepoint c);
  std::string StripRight(const std::string& str, Codepoint c);
  // Drops `fromStart` characters at the front and `fromEnd` at the back.
  std::string Chop(const std::string& str, int fromStart, int fromEnd);

  // --- Searching ---
  // Searches str for the code point `c`.
  bool Contains(const std::string& str, Codepoint c);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment