Created
March 27, 2017 07:53
-
-
Save zz-jason/078110974bb931b7f8e3432775ecfd05 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Forward iterator over the characters of a UTF-8 encoded byte buffer.
//
// The iterator stores the pointer it is given and never copies or frees the
// buffer, so the buffer must outlive the iterator. Sequence lengths are taken
// from the lead byte only; continuation bytes are not validated. A byte that
// is not a valid lead byte (for example a stray continuation byte) is treated
// as a one-byte character so iteration always makes progress.
class Utf8Iterator
{
public:
    // data: start of the UTF-8 bytes (need not be NUL-terminated).
    // size: number of bytes readable from data.
    // A null data pointer is treated as an empty buffer regardless of size.
    Utf8Iterator(const char* data, size_t size)
        : mData(data), mSize(data ? size : 0), mPos(0) {}
    ~Utf8Iterator() {}

    // True while at least one unread byte remains.
    bool HasNext() const
    {
        return mPos < mSize;
    }

    // Skips the current character and returns the new byte offset.
    // Returns the buffer size when the iterator is already exhausted.
    size_t Next()
    {
        if (! HasNext()) return mSize;
        mPos += GetWidth();
        return mPos;
    }

    // Skips up to utf8CharCount characters, stopping early at the end of the
    // buffer, and returns the resulting byte offset.
    size_t Next(size_t utf8CharCount)
    {
        for (size_t i = 0; i < utf8CharCount && HasNext(); ++ i)
        {
            Next();
        }
        return mPos;
    }

    // Width in bytes of the current character, or 0 when exhausted.
    // The result never exceeds the number of bytes left in the buffer, so a
    // multi-byte sequence cut off by the end of the buffer reports only the
    // bytes that are actually present.
    size_t GetWidth() const
    {
        if (! HasNext()) return 0;
        const size_t declared = GetDeclaredWidth();
        // mPos < mSize here, so the subtraction cannot wrap.
        const size_t remaining = mSize - mPos;
        return declared < remaining ? declared : remaining;
    }

    // Pointer to the first byte of the current character. When the iterator
    // is exhausted this points one past the end and must not be dereferenced.
    const char* GetCodePoint() const
    {
        return mData + mPos;
    }

    // Copy of the bytes of the current character; empty when exhausted.
    std::string GetChar() const
    {
        if (! HasNext()) return std::string();
        return std::string(GetCodePoint(), GetWidth());
    }

private:
    // Current byte as an unsigned value so the bit masks below do not depend
    // on whether plain char is signed. Only called while HasNext() is true.
    unsigned char LeadByte() const
    {
        return static_cast<unsigned char>(*GetCodePoint());
    }

    // Sequence length announced by the lead byte, ignoring the buffer end.
    size_t GetDeclaredWidth() const
    {
        if (Is1ByteUtf8CodePoint()) return 1;   // ASCII fast path
        if (Is2ByteUtf8CodePoint()) return 2;
        if (Is3ByteUtf8CodePoint()) return 3;
        if (Is4ByteUtf8CodePoint()) return 4;
        return 1;   // stray continuation byte or invalid lead byte
    }

    bool Is1ByteUtf8CodePoint() const { return 0x00 == (0x80 & LeadByte()); }   // 0xxxxxxx
    bool Is2ByteUtf8CodePoint() const { return 0xc0 == (0xe0 & LeadByte()); }   // 110xxxxx
    bool Is3ByteUtf8CodePoint() const { return 0xe0 == (0xf0 & LeadByte()); }   // 1110xxxx
    bool Is4ByteUtf8CodePoint() const { return 0xf0 == (0xf8 & LeadByte()); }   // 11110xxx

private:
    const char* mData;   // not owned
    size_t mSize;        // total bytes in mData
    size_t mPos;         // byte offset of the current character, always <= mSize
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment