Skip to content

Instantly share code, notes, and snippets.

@rsms
Created November 26, 2010 14:41
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save rsms/716794 to your computer and use it in GitHub Desktop.
Save rsms/716794 to your computer and use it in GitHub Desktop.
Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity range and index lookups
#ifndef H_UTF8_MAPPED_UTF16_STRING_H_
#define H_UTF8_MAPPED_UTF16_STRING_H_
#import <Foundation/Foundation.h>
#import <string>
/*
* Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
* range and index lookups.
*
* Copyright 2010 Rasmus Andersson. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
 * Represents a UTF-16 string and converts it to UTF-8 while recording, for
 * every UTF-8 byte written, the index of the UTF-16 code unit that byte was
 * produced from. The recorded table makes UTF-8 -> UTF-16 index and range
 * lookups a plain array access.
 *
 * NOTE: no copy constructor or assignment operator is declared. The destructor
 * releases every buffer that is not flagged as weak, so copying an instance
 * that owns its buffers makes both copies release the same memory. Pass
 * instances by reference or pointer.
 */
class HUTF8MappedUTF16String {
 protected:
  unichar *u16_buf_;        // the UTF-16 code units being represented
  size_t u16_len_;          // number of code units in |u16_buf_|
  bool u16_weak_;           // someone else owns |u16_buf_|?
  size_t *u8to16_table_;    // UTF-8 byte index -> UTF-16 index; NULL until |convert|
  bool u8to16_table_weak_;  // someone else owns |u8to16_table_|?
  uint8_t *u8_buf_;         // output buffer of |convert()|; owned unless NULL
  size_t u8_len_;           // valid after a call to |convert|

 public:
  /**
   * Create an object representing |u16len| code units at |u16buf|. The buffer
   * is referenced weakly: it is neither copied nor freed by this object.
   */
  HUTF8MappedUTF16String(unichar *u16buf=NULL, size_t u16len=0)
      : u16_buf_(u16buf)
      , u16_len_(u16len)
      , u16_weak_(true)
      , u8to16_table_(NULL)
      , u8to16_table_weak_(true)
      , u8_buf_(NULL)
      , u8_len_(0) {
  }

  ~HUTF8MappedUTF16String();

  /**
   * (Re)set to represent UTF-16 string data. Any previous conversion result
   * (lookup table, UTF-8 length, internal UTF-8 buffer) is discarded.
   *
   * @param u16buf  The UTF-16 code units.
   * @param u16len  Number of code units in |u16buf|.
   * @param weak    If true the caller keeps ownership of |u16buf|; if false
   *                this object takes ownership and frees it later.
   */
  void setUTF16String(unichar *u16buf, size_t u16len, bool weak=true);

  /**
   * (Re)set to represent an NSString. Will make an implicit managed copy of
   * its UTF-16 characters, thus owning a strong reference meaning you can let
   * |str| die without messing up the life of |this|.
   *
   * @param str    String to copy characters from.
   * @param range  Range of |str| (in UTF-16 code units) to copy.
   */
  void setNSString(NSString *str, NSRange range);

  // The number of UTF-16 characters (code units) this object represents
  inline size_t length() const { return u16_len_; }

  // The UTF-16 characters this object represents
  inline const unichar *characters() const { return u16_buf_; }

  // Access the UTF-16 character at index. Only checked by |assert|, i.e.
  // unchecked when assertions are compiled out.
  inline unichar const &operator[](size_t u16index) const {
    // You can use this alternate prototype to allow modification:
    //inline unichar &operator[] (size_t u16index) {
    assert(u16index < u16_len_);
    return u16_buf_[u16index];
  }

  // Maximum number of bytes needed to store a UTF-8 representation (four per
  // UTF-16 code unit). Does not include room for a terminating null byte.
  inline size_t maximumUTF8Size() const { return u16_len_*4; }

  /**
   * Convert the represented Unicode string to UTF-8, returning a (internally
   * allocated) null-terminated UTF-8 C string, which will be valid as long as
   * |this| is alive or until |convert()| is called again or the represented
   * string is reset. You can find out the length of the returned string from
   * |UTF8Length|.
   *
   * See |convert(uint8_t*, size_t*)| for details.
   */
  const uint8_t *convert();

  // Fill |str| with the UTF-8 representation
  void convert(std::string &str);

  /**
   * Convert the represented Unicode string to UTF-8, filling |u8buf|.
   *
   * @param u8buf         A byte buffer to be filled which must be at least
   *                      |maximumUTF8Size| bytes long.
   *
   * @param u8to16_table  A user-allocated lookup table which must have at
   *                      least |maximumUTF8Size| slots. If |u8to16_table| is
   *                      NULL the table will be created and managed
   *                      internally.
   *
   * @returns Number of bytes written to |u8buf|
   */
  size_t convert(uint8_t *u8buf, size_t *u8to16_table=NULL);

  // The number of bytes used for the UTF-8 representation
  inline size_t UTF8Length() const { return u8_len_; }

  /**
   * Return index of UTF-16 character represented by UTF-8 character at
   * |u8index|. Only checked by |assert| and expects an index less than
   * |UTF8Length|.
   */
  inline size_t UTF16IndexForUTF8Index(size_t u8index) const {
    assert(u8index < u8_len_);
    return u8to16_table_[u8index];
  }

  /**
   * Convert a UTF-8 range into the range of its equivalent UTF-16 characters
   * in |characters|. This has low complexity because a lookup table is
   * utilized. Automatically expands to cover any pairs.
   *
   * @param u8range Range in UTF-8 space
   * @returns valid range in UTF-16 space
   */
  NSRange UTF16RangeForUTF8Range(NSRange u8range);

  // Faster version of UTF16RangeForUTF8Range without checks: the lookup table
  // must exist (|convert| has been called) and |u8range| must lie within the
  // converted UTF-8 data.
  inline NSRange unsafeUTF16RangeForUTF8Range(NSRange u8range) {
    NSRange u16range = {u8to16_table_[u8range.location], 0};
    if (u8range.length != 0) {
      // Index of the UTF-16 unit that produced the last byte of the range
      size_t endLocation = u8to16_table_[u8range.location+u8range.length-1];
      // The table only ever records the lead unit of a surrogate pair, so
      // step over the trail unit to include the whole pair.
      if ((u16_buf_[endLocation]&0xfffffc00)==0xd800) // U16_IS_LEAD
        ++endLocation; // expects well-formed UTF-16
      u16range.length = (endLocation+1) - u16range.location;
    }
    return u16range;
  }
};
#endif // H_UTF8_MAPPED_UTF16_STRING_H_
#import "HUTF8MappedUTF16String.h"
// ----------------------------------------------------------------------------
// Macros extracted from icu/unicode/utf16.h
//
// NOTE: U16_NEXT_UNSAFE and U16_NEXT expand to a brace-enclosed block, not an
// expression, so they can only be used as statements. All of these macros
// mention their arguments more than once, so pass plain variables without
// side effects.
/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * The mask clears the low 10 bits, so exactly the 1024 values
 * 0xd800..0xdbff compare equal to 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * Same masking as U16_IS_LEAD, compared against the trail base 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
/**
 * Helper constant for U16_GET_SUPPLEMENTARY. (0x35fdc00)
 * Subtracting it from (lead<<10)+trail removes both surrogate base values
 * (0xd800<<10 and 0xdc00) and adds the 0x10000 supplementary-plane offset in
 * a single step.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
(((uint32_t)(lead)<<10UL)+(uint32_t)(trail)-U16_SURROGATE_OFFSET)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * Note that no length is passed: after a lead surrogate the next unit is
 * read unconditionally, even if it lies beyond the end of the string.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output uint32_t variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
} \
}
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * The temporary |__c2| is declared inside the macro's own block, so it does
 * not leak into the scope of the caller.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
}
// end of icu/unicode/utf16.h
// ----------------------------------------------------------------------------
/**
 * Release every buffer this object owns.
 *
 * All internally created buffers are array allocations (|new T[n]|), so they
 * have to be released with |delete[]|; using scalar |delete| on them is
 * undefined behavior. A UTF-16 buffer handed over with |weak=false| is
 * released the same way and must therefore have been allocated with |new[]|.
 */
HUTF8MappedUTF16String::~HUTF8MappedUTF16String() {
  // delete[] on NULL is a no-op, so only the ownership flags need checking.
  if (!u8to16_table_weak_) {
    delete[] u8to16_table_;
  }
  u8to16_table_ = NULL;

  if (!u16_weak_) {
    delete[] u16_buf_;
  }
  u16_buf_ = NULL;

  // |u8_buf_| is only ever set by |convert()| and is always owned.
  delete[] u8_buf_;
  u8_buf_ = NULL;
}
/**
 * (Re)set to represent the |u16len| UTF-16 code units at |u16buf| and discard
 * every result of a previous conversion.
 *
 * @param u16buf  The UTF-16 code units (may be NULL).
 * @param u16len  Number of code units in |u16buf|.
 * @param weak    If false this object takes ownership of |u16buf|, which must
 *                then have been allocated with |new[]|.
 */
void HUTF8MappedUTF16String::setUTF16String(unichar *u16buf, size_t u16len,
                                            bool weak/*=true*/) {
  // Release the old UTF-16 buffer if we own it. Owned buffers are array
  // allocations, hence delete[]. Skip the release when the very same pointer
  // is being set again, otherwise we would keep a pointer to freed memory.
  if (u16_buf_ && !u16_weak_ && u16_buf_ != u16buf)
    delete[] u16_buf_;

  // set new
  u16_len_ = u16len;
  u16_buf_ = u16buf;
  u16_weak_ = weak;

  // since we no longer can guarantee integrity of the map, let's waste it
  if (u8to16_table_ && !u8to16_table_weak_)
    delete[] u8to16_table_;
  u8to16_table_ = NULL;
  u8_len_ = 0;

  // The internal UTF-8 buffer belongs to the old string. Reset the pointer
  // after freeing so neither |convert()| nor the destructor frees it again.
  delete[] u8_buf_;
  u8_buf_ = NULL;
}
/**
 * (Re)set to represent the characters of |str| within |range|, copied into a
 * freshly allocated buffer owned by this object.
 */
void HUTF8MappedUTF16String::setNSString(NSString *str, NSRange range) {
  // Drop the previous state first. Passing weak=false flags the buffer that
  // is installed right below as owned by this object.
  this->setUTF16String(NULL, range.length, false);

  // Allocate room for the code units and let the string fill it in.
  unichar *chars = new unichar[u16_len_];
  u16_buf_ = chars;
  [str getCharacters:chars range:range];
}
/**
 * Convert to UTF-8 using an internally allocated buffer.
 *
 * @returns A null-terminated UTF-8 C string owned by this object. Its length
 *          (excluding the terminator) is available from |UTF8Length|.
 */
const uint8_t *HUTF8MappedUTF16String::convert() {
  // The previous buffer was allocated with new[], so release it with delete[]
  // and clear the pointer in case the allocation below throws.
  delete[] u8_buf_;
  u8_buf_ = NULL;

  // Worst-case size plus one byte for the terminating null.
  u8_buf_ = new uint8_t[maximumUTF8Size()+1];
  size_t u8len = convert(u8_buf_);
  u8_buf_[u8len] = '\0';
  return u8_buf_;
}
/**
 * Fill |str| with the UTF-8 representation. |str| is first grown to the
 * worst-case size, written into directly, then trimmed to the actual length.
 */
void HUTF8MappedUTF16String::convert(std::string &str) {
  // Make room for the largest possible result.
  const size_t capacity = maximumUTF8Size();
  str.resize(capacity);

  // Write straight into the string's storage.
  uint8_t *bytes = reinterpret_cast<uint8_t*>(const_cast<char*>(str.data()));
  const size_t written = convert(bytes);

  // Cut off the unused tail.
  str.resize(written);
}
/**
 * Convert the represented UTF-16 string to UTF-8, filling |u8buf| and
 * recording for every byte written the index of the UTF-16 code unit it was
 * produced from. For a surrogate pair the index of the lead unit is recorded
 * for all four bytes.
 *
 * Unpaired surrogates are not combined with their neighbour: each one is
 * emitted on its own as a three-byte sequence, and decoding never reads past
 * the end of the UTF-16 buffer.
 *
 * @param u8buf         Output buffer of at least |maximumUTF8Size| bytes. No
 *                      terminating null byte is written.
 * @param u8to16_table  Optional caller-owned table with at least
 *                      |maximumUTF8Size| slots. If NULL a table is allocated
 *                      and owned by this object.
 * @returns Number of bytes written to |u8buf|.
 */
size_t HUTF8MappedUTF16String::convert(uint8_t *u8buf,
                                       size_t *u8to16_table/*=NULL*/) {
  // setup u8to16_table. An owned table is an array allocation -> delete[].
  if (!u8to16_table_weak_)
    delete[] u8to16_table_;
  u8to16_table_ = NULL;
  if (u8to16_table) {
    u8to16_table_ = u8to16_table;
    u8to16_table_weak_ = true;
  } else {
    u8to16_table_ = new size_t[maximumUTF8Size()];
    u8to16_table_weak_ = false;
  }

  // reset u8_len_
  u8_len_ = 0;

  // For each UTF-16 character...
  for (size_t u16i=0; u16i < u16_len_; ) {
    // Retrieve 1-2 UTF-16 characters, forming one 32-bit unicode character.
    // The bounds-checked macro is used because the represented string may be
    // an arbitrary slice of an NSString and thus end in a lone lead surrogate.
    uint32_t u32c = 0;
    size_t u16i_next = u16i;
    U16_NEXT(u16_buf_, u16i_next, u16_len_, u32c);

    // Encode u32c as 1-4 UTF-8 bytes
    uint8_t bytes[4];
    size_t nbytes;
    if (u32c <= 0x7f) {
      bytes[0] = (uint8_t)u32c;
      nbytes = 1;
    } else if (u32c <= 0x7ff) {
      bytes[0] = (uint8_t)((u32c>>6)|0xc0);
      bytes[1] = (uint8_t)((u32c&0x3f)|0x80);
      nbytes = 2;
    } else if (u32c <= 0xffff) {
      bytes[0] = (uint8_t)((u32c>>12)|0xe0);
      bytes[1] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
      bytes[2] = (uint8_t)((u32c&0x3f)|0x80);
      nbytes = 3;
    } else {
      bytes[0] = (uint8_t)((u32c>>18)|0xf0);
      bytes[1] = (uint8_t)(((u32c>>12)&0x3f)|0x80);
      bytes[2] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
      bytes[3] = (uint8_t)((u32c&0x3f)|0x80);
      nbytes = 4;
    }

    // Append the bytes; each one maps back to the first UTF-16 unit of the
    // character. At most 4 bytes are written per consumed unit, so neither
    // buffer can exceed |maximumUTF8Size| entries.
    for (size_t k=0; k < nbytes; ++k) {
      u8to16_table_[u8_len_] = u16i;
      u8buf[u8_len_++] = bytes[k];
    }

    u16i = u16i_next;
  }
  return u8_len_;
}
/**
 * Convert a UTF-8 range into the range of its equivalent UTF-16 characters,
 * expanding the end to cover a whole surrogate pair.
 *
 * An empty range located exactly at the end of the UTF-8 data maps to an
 * empty range at the end of the UTF-16 data.
 *
 * @param u8range Range in UTF-8 space
 * @returns Range in UTF-16 space
 * @throws NSRangeException if |u8range| extends beyond |UTF8Length|.
 */
NSRange HUTF8MappedUTF16String::UTF16RangeForUTF8Range(NSRange u8range) {
  // Written so that location+length is never computed before it is known to
  // fit; a huge location or length could otherwise wrap around and pass.
  if (u8range.location > u8_len_ ||
      u8range.length > u8_len_ - u8range.location) {
    [NSException raise:NSRangeException
                format:@"Range %@ beyond end (%zu) of data",
                       NSStringFromRange(u8range), u8_len_];
    return NSRange();
  }

  // There is no table entry for the past-the-end position (and no table at
  // all before the first conversion), so answer without touching the table.
  if (u8range.location == u8_len_)
    return NSMakeRange(u16_len_, 0);

  NSRange u16range = {u8to16_table_[u8range.location], 0};
  // Because we never record 2nd part of a pair when building our table, this
  // should never happen. We keep the code (out-commented) for clarity sake:
  //if (U16_IS_TRAIL(u16_buf_[u16range.location]))
  //  --(u16range.location);
  if (u8range.length != 0) {
    // UTF-16 unit that produced the last byte of the range
    size_t endLocation = u8to16_table_[u8range.location+u8range.length-1];
    // Include the trail unit of a surrogate pair, but only if there really
    // is one, so the result never extends past the UTF-16 data.
    if (U16_IS_LEAD(u16_buf_[endLocation]) &&
        endLocation+1 < u16_len_ &&
        U16_IS_TRAIL(u16_buf_[endLocation+1])) {
      ++endLocation;
    }
    u16range.length = (endLocation+1) - u16range.location;
  }
  return u16range;
}
utf8 value => 'hej ♜|♞|𝄞 då'
utf8[0] => utf16[0] -> 'h' \u68
utf8[1] => utf16[1] -> 'e' \u65
utf8[2] => utf16[2] -> 'j' \u6a
utf8[3] => utf16[3] -> ' ' \u20
utf8[4] => utf16[4] -> '♜' \u265c
utf8[5] => utf16[4] -> '♜' \u265c
utf8[6] => utf16[4] -> '♜' \u265c
utf8[7] => utf16[5] -> '|' \u7c
utf8[8] => utf16[6] -> '♞' \u265e
utf8[9] => utf16[6] -> '♞' \u265e
utf8[10] => utf16[6] -> '♞' \u265e
utf8[11] => utf16[7] -> '|' \u7c
utf8[12] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[13] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[14] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[15] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[16] => utf16[10] -> ' ' \u20
utf8[17] => utf16[11] -> 'd' \u64
utf8[18] => utf16[12] -> 'å' \ue5
utf8[19] => utf16[12] -> 'å' \ue5
u8range: {2, 16} -> 'j ♜|♞|𝄞 d'
u16range: {2, 10} -> 'j ♜|♞|𝄞 d'
#import "HUTF8MappedUTF16String.h"
// example and a kind of a test
// True unless |c| is a surrogate code unit (U+d800..U+dfff), i.e. when |c| is
// a complete character on its own. Fully parenthesized so the negation cannot
// bind to anything else when the macro is used inside a larger expression.
#define U16_IS_SINGLE(c) (!(((c)&0xfffff800)==0xd800))

/**
 * Example and a kind of a test: converts a short Unicode string to UTF-8,
 * logs the UTF-16 index every UTF-8 byte maps to and finally maps a UTF-8
 * range back to a UTF-16 range, logging the substring both ranges select.
 */
int main (int argc, const char * argv[]) {
  NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];

  // Our original Unicode test string
  unichar u16chars[] = {
    // Unicode characters:
    //   h  e  j  ' '  ♜  |  ♞  |  𝄞  ' '  d  å
    // Unicode (bits/char):
    //   8  8  8   8   16  8  16  8  32   8   8  16
    // UTF-8 widths (bytes/char):
    //   1  1  1   1    3  1   3  1   4   1   1   2
    'h','e','j',' ',0x265c,'|',0x265e,'|',0xd834,0xdd1e,' ','d',0xe5 };
  NSString *str = [NSString stringWithCharacters:u16chars length:
                   sizeof(u16chars)/sizeof(*u16chars)];

  HUTF8MappedUTF16String mappedString;
  mappedString.setNSString(str, NSMakeRange(0,str.length));

  // UTF-8 buffer (+1 for the terminating null added below)
  uint8_t *u8buf = new uint8_t[mappedString.maximumUTF8Size()+1];

  // convert
  size_t u8len = mappedString.convert(u8buf);
  u8buf[u8len] = '\0';
  fprintf(stderr, "utf8 value => '%s'\n", u8buf);

  // Log, for every UTF-8 byte, the UTF-16 unit(s) it originates from
  for (size_t i=0; i<u8len; i++) {
    size_t index = mappedString.UTF16IndexForUTF8Index(i);
    unichar c = mappedString[index];
    // Only read a second unit when there is one, so a lone surrogate at the
    // very end is logged as a single unit instead of reading out of bounds.
    if (U16_IS_SINGLE(c) || index+1 >= mappedString.length()) {
      NSLog(@"utf8[%zu] => utf16[%zu] -> '%C' \\u%x", i, index, c, c);
    } else {
      NSLog(@"utf8[%zu] => utf16[%zu..%zu] -> '%C%C' \\u%x \\u%x",
            i, index, index+1, c, mappedString[index+1],
            c, mappedString[index+1]);
    }
  }

  NSRange u8range = NSMakeRange(2, u8len-4); // should be "j ♜|♞|𝄞 d"
  //u8range = NSMakeRange(12, 1); // should be "𝄞"
  NSString *u8substr = // temporary so we can use NSLog
    [[[NSString alloc] initWithBytesNoCopy:u8buf+u8range.location
                                    length:u8range.length
                                  encoding:NSUTF8StringEncoding
                              freeWhenDone:NO] autorelease];
  NSLog(@"u8range: %@ -> '%@'", NSStringFromRange(u8range), u8substr);

  NSRange u16range = mappedString.UTF16RangeForUTF8Range(u8range);
  NSString *u16substr = [str substringWithRange:u16range];
  NSLog(@"u16range: %@ -> '%@'", NSStringFromRange(u16range), u16substr);

  [pool drain];
  // |u8substr| references |u8buf| without copying and lives until the pool
  // is drained, so the buffer may only be released afterwards.
  delete[] u8buf;
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment