rsms/HUTF8MappedUTF16String.h

## HUTF8MappedUTF16String.h
#ifndef H_UTF8_MAPPED_UTF16_STRING_H_
#define H_UTF8_MAPPED_UTF16_STRING_H_

#import <Foundation/Foundation.h>
#import <string>

/*
 * Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
 * range and index lookups.
 *
 * Copyright 2010 Rasmus Andersson. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/**
 * Holds a UTF-16 string and converts it to UTF-8 while recording, for every
 * UTF-8 byte produced, the index of the UTF-16 code unit at which the
 * corresponding character starts. The recorded table makes UTF-8 -> UTF-16
 * index and range lookups cheap.
 *
 * Ownership: buffers flagged as "weak" belong to the caller; every other
 * buffer is released by this object. NOTE: no copy constructor or assignment
 * operator is declared, so copying an instance that owns a buffer makes both
 * copies release the same memory -- pass instances by reference or pointer.
 */
class HUTF8MappedUTF16String {
 protected:
  unichar  *u16_buf_;  // the UTF-16 code units being represented
  size_t    u16_len_;  // number of code units in |u16_buf_|
  bool      u16_weak_; // someone else owns |u16_buf_|?

  size_t   *u8to16_table_; // UTF-8 byte index -> UTF-16 index; NULL until |convert|
  bool      u8to16_table_weak_; // someone else owns |u8to16_table_|?

  uint8_t  *u8_buf_; // owned unless NULL; only used by the no-argument |convert|
  size_t    u8_len_; // valid after a call to |convert|

 public:
  // Represent |u16len| code units at |u16buf| without taking ownership.
  HUTF8MappedUTF16String(unichar *u16buf=NULL, size_t u16len=0)
      : u16_buf_(u16buf)
      , u16_len_(u16len)
      , u16_weak_(true)
      , u8to16_table_(NULL)
      , u8to16_table_weak_(true)
      , u8_buf_(NULL)
      , u8_len_(0) {
  }
  ~HUTF8MappedUTF16String();

  // (Re)set to represent UTF-16 string data. Pass weak=false to hand ownership
  // of |u16buf| over to |this|.
  void setUTF16String(unichar *u16buf, size_t u16len, bool weak=true);

  /**
   * (Re)set to represent an NSString. Will make an implicit managed copy of its
   * UTF-16 characters, thus owning a strong reference meaning you can let |str|
   * die without messing up the life of |this|.
   */
  void setNSString(NSString *str, NSRange range);

  // The number of UTF-16 characters this object represents
  inline size_t length() const { return u16_len_; }

  // The UTF-16 characters this object represents
  inline const unichar *characters() const { return u16_buf_; }

  // Access the UTF-16 character at index. Only checked by |assert|.
  inline unichar const &operator[](size_t u16index) const {
    // You can use this alternate prototype to allow modification:
    //inline unichar &operator[] (size_t u16index) {
    assert(u16index < u16_len_);
    return u16_buf_[u16index];
  }

  // Upper bound on the number of bytes needed to store a UTF-8 representation
  // (4 bytes are reserved per UTF-16 code unit).
  inline size_t maximumUTF8Size() const { return u16_len_*4; }

  /**
   * Convert the represented Unicode string to UTF-8, returning a (internally
   * allocated) null-terminated UTF-8 C string, which will be valid as long as
   * |this| is alive or until |convert| is called. You can find out the length
   * of the returned string from |UTF8Length|.
   *
   * See |convert(uint8_t*, size_t*)| for details.
   */
  const uint8_t *convert();

  // Fill |str| with the UTF-8 representation
  void convert(std::string &str);

  /**
   * Convert the represented Unicode string to UTF-8, filling |u8buf|.
   *
   * @param u8buf         A byte buffer to be filled which must be at least
   *                      |maximumUTF8Size| bytes long.
   *
   * @param u8to16_table  A user-allocated lookup table which must have at least
   *                      |maximumUTF8Size| slots. If |u8to16_table| is NULL the
   *                      table will be created and managed internally.
   *
   * @returns Number of bytes written to |u8buf|
   */
  size_t convert(uint8_t *u8buf, size_t *u8to16_table=NULL);

  // The number of bytes used for the UTF-8 representation
  inline size_t UTF8Length() const { return u8_len_; }

  /**
   * Return index of UTF-16 character represented by UTF-8 character at
   * |u8index|. Only checked by |assert|; expects an index less than
   * |UTF8Length|.
   */
  inline size_t UTF16IndexForUTF8Index(size_t u8index) const {
    assert(u8index < u8_len_);
    return u8to16_table_[u8index];
  }

  /**
   * Convert a UTF-8 range into the range of its equivalent UTF-16 characters
   * in |characters|. This has low complexity because a lookup table is
   * utilized. Automatically expands to cover any pairs.
   *
   * @param u8range Range in UTF-8 space
   * @returns       valid range in UTF-16 space
   */
  NSRange UTF16RangeForUTF8Range(NSRange u8range);

  // Faster version of UTF16RangeForUTF8Range without checks. Requires a prior
  // |convert|, a range inside |UTF8Length| and well-formed UTF-16.
  inline NSRange unsafeUTF16RangeForUTF8Range(NSRange u8range) const {
    NSRange u16range = {u8to16_table_[u8range.location], 0};
    if (u8range.length != 0) {
      size_t endLocation = u8to16_table_[u8range.location+u8range.length-1];
      if ((u16_buf_[endLocation]&0xfffffc00)==0xd800) // U16_IS_LEAD
        ++endLocation; // expects well-formed UTF-16
      u16range.length = (endLocation+1) - u16range.location;
    }
    return u16range;
  }
};

#endif  // H_UTF8_MAPPED_UTF16_STRING_H_

## HUTF8MappedUTF16String.mm
#import "HUTF8MappedUTF16String.h"

// ----------------------------------------------------------------------------
// Macros extracted from icu/unicode/utf16.h

/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

/**
 * Helper constant for U16_GET_SUPPLEMENTARY. (0x35fdc00)
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((uint32_t)(lead)<<10UL)+(uint32_t)(trail)-U16_SURROGATE_OFFSET)

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output uint32_t variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}

// end of icu/unicode/utf16.h
// ----------------------------------------------------------------------------


/**
 * Release every buffer this object owns. Buffers flagged as weak belong to
 * the caller and are left untouched.
 */
HUTF8MappedUTF16String::~HUTF8MappedUTF16String() {
  // Every owned buffer is an array (this class allocates them with new[]), so
  // it must be released with delete[]; scalar delete on an array is undefined
  // behaviour. A caller-supplied strong UTF-16 buffer is therefore expected
  // to come from new[] as well. delete[] on NULL is a no-op.
  if (!u8to16_table_weak_) delete[] u8to16_table_;
  u8to16_table_ = NULL;
  if (!u16_weak_) delete[] u16_buf_;
  u16_buf_ = NULL;
  delete[] u8_buf_;
  u8_buf_ = NULL;
}


/**
 * (Re)set the UTF-16 data this object represents and discard every result of
 * a previous conversion (lookup table, internal UTF-8 buffer, UTF-8 length).
 *
 * @param u16buf  UTF-16 code units, may be NULL.
 * @param u16len  Number of code units in |u16buf|.
 * @param weak    true: the caller keeps ownership of |u16buf|.
 *                false: |this| takes ownership and releases the buffer with
 *                delete[], so it has to be allocated with new[].
 */
void HUTF8MappedUTF16String::setUTF16String(unichar *u16buf, size_t u16len,
                                            bool weak/*=true*/) {
  // Release the previous buffer if we own it -- unless the very same buffer
  // is being handed back to us, in which case it must stay alive.
  if (!u16_weak_ && u16_buf_ != u16buf) delete[] u16_buf_;
  // set new
  u16_len_ = u16len;
  u16_buf_ = u16buf;
  u16_weak_ = weak;
  // since we no longer can guarantee integrity of the map, let's waste it
  if (!u8to16_table_weak_) delete[] u8to16_table_;
  u8to16_table_ = NULL;
  u8to16_table_weak_ = true;
  u8_len_ = 0;
  // The internal UTF-8 buffer is stale too. Reset the pointer after freeing,
  // otherwise the next convert() or the destructor would free it again.
  delete[] u8_buf_;
  u8_buf_ = NULL;
}


/**
 * (Re)set to represent the characters of |str| in |range|. The characters are
 * copied into a newly allocated buffer owned by |this|, so |str| does not
 * need to outlive this object.
 *
 * @param str    Source string. A nil string is treated as an empty string.
 * @param range  Range of |str| to copy; it is passed unchecked to
 *               -getCharacters:range:.
 */
void HUTF8MappedUTF16String::setNSString(NSString *str, NSRange range) {
  if (str == nil) {
    // Messaging nil would leave the freshly allocated buffer uninitialised
    // while length() still reported |range.length| code units.
    setUTF16String(NULL, 0, false);
    return;
  }
  // Reset first (drops old buffers and conversion results), then install the
  // owned copy.
  setUTF16String(NULL, range.length, false);
  u16_buf_ = new unichar[u16_len_];
  [str getCharacters:u16_buf_ range:range];
}


/**
 * Convert to UTF-8 using an internally owned buffer.
 *
 * @returns NUL-terminated UTF-8 bytes, valid until the next call to
 *          |convert()| or |setUTF16String|, or until |this| is destroyed.
 */
const uint8_t *HUTF8MappedUTF16String::convert() {
  // The buffer is allocated with new[], so it must be released with delete[].
  // Clear the pointer before allocating so that a throwing |new| cannot leave
  // a dangling pointer behind for the destructor to free again.
  delete[] u8_buf_;
  u8_buf_ = NULL;
  // +1 leaves room for the terminating NUL
  u8_buf_ = new uint8_t[maximumUTF8Size()+1];
  size_t u8len = convert(u8_buf_);
  u8_buf_[u8len] = '\0';
  return u8_buf_;
}


/**
 * Replace the contents of |str| with the UTF-8 representation.
 */
void HUTF8MappedUTF16String::convert(std::string &str) {
  // Grow to the worst case so the converter can write straight into the
  // string's storage ...
  str.resize(maximumUTF8Size());
  uint8_t *bytes = reinterpret_cast<uint8_t*>(const_cast<char*>(str.data()));
  const size_t written = convert(bytes);
  // ... then trim to the number of bytes actually produced.
  str.resize(written);
}


/**
 * Convert the represented UTF-16 string to UTF-8, writing the bytes to |u8buf|
 * and recording, for every byte written, the index of the UTF-16 code unit
 * that starts the character the byte belongs to.
 *
 * @param u8buf         Destination; must hold at least |maximumUTF8Size|
 *                      bytes. No terminating NUL is written.
 * @param u8to16_table  Optional caller-owned table with at least
 *                      |maximumUTF8Size| slots. When NULL, a table is
 *                      allocated and owned by |this|.
 * @returns Number of bytes written to |u8buf| (same value as |UTF8Length|).
 */
size_t HUTF8MappedUTF16String::convert(uint8_t *u8buf,
                                       size_t *u8to16_table/*=NULL*/) {
  // Release the table of a previous conversion if we own it. It is allocated
  // with new[] below, so it has to be released with delete[].
  if (!u8to16_table_weak_) delete[] u8to16_table_;
  // Forget the old table before allocating so that a throwing |new| cannot
  // leave a dangling pointer behind.
  u8to16_table_ = NULL;
  u8to16_table_weak_ = true;
  if (u8to16_table) {
    u8to16_table_ = u8to16_table;
  } else {
    u8to16_table_ = new size_t[maximumUTF8Size()];
    u8to16_table_weak_ = false;
  }

  // reset u8_len_
  u8_len_ = 0;

  // For each UTF-16 character...
  for (size_t u16i=0; u16i < u16_len_; ) {
    // Retrieve 1-2 UTF-16 code units, forming one 32-bit unicode character.
    // The bounds-checked macro is used on purpose: the unchecked variant
    // reads one code unit past the end of |u16_buf_| when the string ends
    // with an unpaired lead surrogate.
    uint32_t u32c = 0;
    size_t u16i_next = u16i;
    U16_NEXT(u16_buf_, u16i_next, u16_len_, u32c);

    // Encode u32c as 1-4 UTF-8 bytes
    uint8_t encoded[4];
    size_t encoded_len;
    if (u32c <= 0x7f) {
      encoded[0] = (uint8_t)u32c;
      encoded_len = 1;
    } else if (u32c <= 0x7ff) {
      encoded[0] = (uint8_t)((u32c>>6)|0xc0);
      encoded[1] = (uint8_t)((u32c&0x3f)|0x80);
      encoded_len = 2;
    } else if (u32c <= 0xffff) {
      encoded[0] = (uint8_t)((u32c>>12)|0xe0);
      encoded[1] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
      encoded[2] = (uint8_t)((u32c&0x3f)|0x80);
      encoded_len = 3;
    } else {
      encoded[0] = (uint8_t)((u32c>>18)|0xf0);
      encoded[1] = (uint8_t)(((u32c>>12)&0x3f)|0x80);
      encoded[2] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
      encoded[3] = (uint8_t)((u32c&0x3f)|0x80);
      encoded_len = 4;
    }

    // Append the bytes; every byte maps back to the first code unit of the
    // character (the second half of a surrogate pair is never recorded).
    for (size_t k=0; k < encoded_len; ++k) {
      u8to16_table_[u8_len_] = u16i;
      u8buf[u8_len_++] = encoded[k];
    }

    u16i = u16i_next;
  }

  return u8_len_;
}


/**
 * Convert a range of UTF-8 bytes into the range of UTF-16 code units covering
 * the same characters. A range that ends inside a character is extended to
 * the end of that character (including both halves of a surrogate pair).
 *
 * @param u8range  Range in UTF-8 space; must lie within |UTF8Length|,
 *                 otherwise an NSRangeException is raised.
 * @returns        Range in UTF-16 space.
 */
NSRange HUTF8MappedUTF16String::UTF16RangeForUTF8Range(NSRange u8range) {
  // Written so that location+length cannot wrap around.
  if (u8range.location > u8_len_ ||
      u8range.length > u8_len_ - u8range.location) {
    [NSException raise:NSRangeException
                format:@"Range %@ beyond end (%zu) of data",
                       NSStringFromRange(u8range), u8_len_];
    return NSRange(); // not reached
  }
  // An (empty) range at the very end has no slot in the table -- and there is
  // no table at all before the first |convert| -- so answer it directly.
  if (u8range.location == u8_len_) {
    return NSMakeRange(u16_len_, 0);
  }
  // The table only ever records the first code unit of a character, so the
  // start never needs adjusting.
  NSRange u16range = {u8to16_table_[u8range.location], 0};
  if (u8range.length != 0) {
    // The character holding the last byte ends where the next character
    // starts. Skip the remaining bytes of that character (at most 3) and read
    // the start of the next one from the table, or use the end of the string.
    // This never looks past either buffer, even for malformed UTF-16.
    size_t u8next = u8range.location + u8range.length;
    const size_t lastCharLocation = u8to16_table_[u8next-1];
    while (u8next < u8_len_ && u8to16_table_[u8next] == lastCharLocation)
      ++u8next;
    const size_t endLocation =
        (u8next < u8_len_) ? u8to16_table_[u8next] : u16_len_;
    u16range.length = endLocation - u16range.location;
  }
  return u16range;
}

## main-output.txt
utf8 value => 'hej ♜|♞|𝄞 då'
utf8[0] => utf16[0] -> 'h' \u68
utf8[1] => utf16[1] -> 'e' \u65
utf8[2] => utf16[2] -> 'j' \u6a
utf8[3] => utf16[3] -> ' ' \u20
utf8[4] => utf16[4] -> '♜' \u265c
utf8[5] => utf16[4] -> '♜' \u265c
utf8[6] => utf16[4] -> '♜' \u265c
utf8[7] => utf16[5] -> '|' \u7c
utf8[8] => utf16[6] -> '♞' \u265e
utf8[9] => utf16[6] -> '♞' \u265e
utf8[10] => utf16[6] -> '♞' \u265e
utf8[11] => utf16[7] -> '|' \u7c
utf8[12] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[13] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[14] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[15] => utf16[8..9] -> '𝄞' \ud834 \udd1e
utf8[16] => utf16[10] -> ' ' \u20
utf8[17] => utf16[11] -> 'd' \u64
utf8[18] => utf16[12] -> 'å' \ue5
utf8[19] => utf16[12] -> 'å' \ue5
u8range: {2, 16} -> 'j ♜|♞|𝄞 d'
u16range: {2, 10} -> 'j ♜|♞|𝄞 d'

## main.mm
#import "HUTF8MappedUTF16String.h"
// example and a kind of a test

#define U16_IS_SINGLE(c) !(((c)&0xfffff800)==0xd800)

// Example / smoke test: converts a mixed-width string to UTF-8, prints the
// UTF-8 -> UTF-16 index mapping for every byte and maps a UTF-8 range back to
// UTF-16.
int main (int argc, const char * argv[]) {
  NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];

  // Our original Unicode test string
  unichar u16chars[] = {
  // Unicode characters:
  //    h    e   j        ♜    |    ♞    |        𝄞             d   å
  // Unicode (bits/char):
  //    8    8   8   8    16    8    16    8       32        8   8   16
  // UTF-8 widths (bytes/char):
  //    1    1   1   1     3    1     3    1        4        1   1    2
        'h','e','j',' ',0x265c,'|',0x265e,'|',0xd834,0xdd1e,' ','d',0xe5 };
  NSString *str = [NSString stringWithCharacters:u16chars length:
      sizeof(u16chars)/sizeof(*u16chars)];
  HUTF8MappedUTF16String mappedString;
  mappedString.setNSString(str, NSMakeRange(0,str.length));

  // UTF-8 buffer (+1 for the terminating NUL added below)
  uint8_t *u8buf = new uint8_t[mappedString.maximumUTF8Size()+1];

  // convert
  size_t u8len = mappedString.convert(u8buf);

  u8buf[u8len] = '\0';
  fprintf(stderr, "utf8 value => '%s'\n", u8buf);

  for (size_t i=0; i<u8len; i++) {
    size_t index = mappedString.UTF16IndexForUTF8Index(i);
    unichar c = mappedString[index];

    if (U16_IS_SINGLE(c)) {
      NSLog(@"utf8[%zu] => utf16[%zu] -> '%C' \\u%x", i, index, c, c);
    } else {
      NSLog(@"utf8[%zu] => utf16[%zu..%zu] -> '%C%C' \\u%x \\u%x",
            i, index, index+1, c, mappedString[index+1],
            c, mappedString[index+1]);
    }
  }

  NSRange u8range = NSMakeRange(2, u8len-4); // should be "j ♜|♞|𝄞 d"
  //u8range = NSMakeRange(12, 1); // should be "𝄞"
  NSString *u8substr = // temporary so we can use NSLog
      [[[NSString alloc] initWithBytesNoCopy:u8buf+u8range.location
                                      length:u8range.length
                                    encoding:NSUTF8StringEncoding
                                freeWhenDone:NO] autorelease];
  NSLog(@"u8range: %@ -> '%@'", NSStringFromRange(u8range), u8substr);
  NSRange u16range = mappedString.UTF16RangeForUTF8Range(u8range);
  NSString *u16substr = [str substringWithRange:u16range];
  NSLog(@"u16range: %@ -> '%@'", NSStringFromRange(u16range), u16substr);

  [pool drain];
  // |u8substr| referenced the bytes of |u8buf| without copying them, so the
  // buffer may only be released once the pool has let go of the string.
  delete[] u8buf;
  return 0;
}
	#ifndef H_UTF8_MAPPED_UTF16_STRING_H_
	#define H_UTF8_MAPPED_UTF16_STRING_H_

	#import <Foundation/Foundation.h>
	#import <string>

	/*
	* Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
	* range and index lookups.
	*
	* Copyright 2010 Rasmus Andersson. All rights reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to
	* deal in the Software without restriction, including without limitation the
	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
	* sell copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*/
	/**
	 * Holds a UTF-16 string and converts it to UTF-8 while recording, for every
	 * UTF-8 byte produced, the index of the UTF-16 code unit at which the
	 * corresponding character starts. The recorded table makes UTF-8 -> UTF-16
	 * index and range lookups cheap.
	 *
	 * Ownership: buffers flagged as "weak" belong to the caller; every other
	 * buffer is released by this object. NOTE: no copy constructor or assignment
	 * operator is declared, so copying an instance that owns a buffer makes both
	 * copies release the same memory -- pass instances by reference or pointer.
	 */
	class HUTF8MappedUTF16String {
	 protected:
	  unichar  *u16_buf_;  // the UTF-16 code units being represented
	  size_t    u16_len_;  // number of code units in |u16_buf_|
	  bool      u16_weak_; // someone else owns |u16_buf_|?

	  size_t   *u8to16_table_; // UTF-8 byte index -> UTF-16 index; NULL until |convert|
	  bool      u8to16_table_weak_; // someone else owns |u8to16_table_|?

	  uint8_t  *u8_buf_; // owned unless NULL; only used by the no-argument |convert|
	  size_t    u8_len_; // valid after a call to |convert|

	 public:
	  // Represent |u16len| code units at |u16buf| without taking ownership.
	  HUTF8MappedUTF16String(unichar *u16buf=NULL, size_t u16len=0)
	      : u16_buf_(u16buf)
	      , u16_len_(u16len)
	      , u16_weak_(true)
	      , u8to16_table_(NULL)
	      , u8to16_table_weak_(true)
	      , u8_buf_(NULL)
	      , u8_len_(0) {
	  }
	  ~HUTF8MappedUTF16String();

	  // (Re)set to represent UTF-16 string data. Pass weak=false to hand ownership
	  // of |u16buf| over to |this|.
	  void setUTF16String(unichar *u16buf, size_t u16len, bool weak=true);

	  /**
	   * (Re)set to represent an NSString. Will make an implicit managed copy of its
	   * UTF-16 characters, thus owning a strong reference meaning you can let |str|
	   * die without messing up the life of |this|.
	   */
	  void setNSString(NSString *str, NSRange range);

	  // The number of UTF-16 characters this object represents
	  inline size_t length() const { return u16_len_; }

	  // The UTF-16 characters this object represents
	  inline const unichar *characters() const { return u16_buf_; }

	  // Access the UTF-16 character at index. Only checked by |assert|.
	  inline unichar const &operator[](size_t u16index) const {
	    // You can use this alternate prototype to allow modification:
	    //inline unichar &operator[] (size_t u16index) {
	    assert(u16index < u16_len_);
	    return u16_buf_[u16index];
	  }

	  // Upper bound on the number of bytes needed to store a UTF-8 representation
	  // (4 bytes are reserved per UTF-16 code unit).
	  inline size_t maximumUTF8Size() const { return u16_len_*4; }

	  /**
	   * Convert the represented Unicode string to UTF-8, returning a (internally
	   * allocated) null-terminated UTF-8 C string, which will be valid as long as
	   * |this| is alive or until |convert| is called. You can find out the length
	   * of the returned string from |UTF8Length|.
	   *
	   * See |convert(uint8_t*, size_t*)| for details.
	   */
	  const uint8_t *convert();

	  // Fill |str| with the UTF-8 representation
	  void convert(std::string &str);

	  /**
	   * Convert the represented Unicode string to UTF-8, filling |u8buf|.
	   *
	   * @param u8buf         A byte buffer to be filled which must be at least
	   *                      |maximumUTF8Size| bytes long.
	   *
	   * @param u8to16_table  A user-allocated lookup table which must have at least
	   *                      |maximumUTF8Size| slots. If |u8to16_table| is NULL the
	   *                      table will be created and managed internally.
	   *
	   * @returns Number of bytes written to |u8buf|
	   */
	  size_t convert(uint8_t *u8buf, size_t *u8to16_table=NULL);

	  // The number of bytes used for the UTF-8 representation
	  inline size_t UTF8Length() const { return u8_len_; }

	  /**
	   * Return index of UTF-16 character represented by UTF-8 character at
	   * |u8index|. Only checked by |assert|; expects an index less than
	   * |UTF8Length|.
	   */
	  inline size_t UTF16IndexForUTF8Index(size_t u8index) const {
	    assert(u8index < u8_len_);
	    return u8to16_table_[u8index];
	  }

	  /**
	   * Convert a UTF-8 range into the range of its equivalent UTF-16 characters
	   * in |characters|. This has low complexity because a lookup table is
	   * utilized. Automatically expands to cover any pairs.
	   *
	   * @param u8range Range in UTF-8 space
	   * @returns       valid range in UTF-16 space
	   */
	  NSRange UTF16RangeForUTF8Range(NSRange u8range);

	  // Faster version of UTF16RangeForUTF8Range without checks. Requires a prior
	  // |convert|, a range inside |UTF8Length| and well-formed UTF-16.
	  inline NSRange unsafeUTF16RangeForUTF8Range(NSRange u8range) const {
	    NSRange u16range = {u8to16_table_[u8range.location], 0};
	    if (u8range.length != 0) {
	      size_t endLocation = u8to16_table_[u8range.location+u8range.length-1];
	      if ((u16_buf_[endLocation]&0xfffffc00)==0xd800) // U16_IS_LEAD
	        ++endLocation; // expects well-formed UTF-16
	      u16range.length = (endLocation+1) - u16range.location;
	    }
	    return u16range;
	  }
	};

	#endif // H_UTF8_MAPPED_UTF16_STRING_H_
	#import "HUTF8MappedUTF16String.h"

	// ----------------------------------------------------------------------------
	// Macros extracted from icu/unicode/utf16.h

	/**
	* Is this code unit a lead surrogate (U+d800..U+dbff)?
	* @param c 16-bit code unit
	* @return TRUE or FALSE
	* @stable ICU 2.4
	*/
	#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

	/**
	* Is this code unit a trail surrogate (U+dc00..U+dfff)?
	* @param c 16-bit code unit
	* @return TRUE or FALSE
	* @stable ICU 2.4
	*/
	#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

	/**
	* Helper constant for U16_GET_SUPPLEMENTARY. (0x35fdc00)
	* @internal
	*/
	#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

	/**
	* Get a supplementary code point value (U+10000..U+10ffff)
	* from its lead and trail surrogates.
	* The result is undefined if the input values are not
	* lead and trail surrogates.
	*
	* @param lead lead surrogate (U+d800..U+dbff)
	* @param trail trail surrogate (U+dc00..U+dfff)
	* @return supplementary code point (U+10000..U+10ffff)
	* @stable ICU 2.4
	*/
	#define U16_GET_SUPPLEMENTARY(lead, trail) \
	(((uint32_t)(lead)<<10UL)+(uint32_t)(trail)-U16_SURROGATE_OFFSET)

	/**
	* Get a code point from a string at a code point boundary offset,
	* and advance the offset to the next code point boundary.
	* (Post-incrementing forward iteration.)
	* "Unsafe" macro, assumes well-formed UTF-16.
	*
	* The offset may point to the lead surrogate unit
	* for a supplementary code point, in which case the macro will read
	* the following trail surrogate as well.
	* If the offset points to a trail surrogate, then that itself
	* will be returned as the code point.
	* The result is undefined if the offset points to a single, unpaired lead surrogate.
	*
	* @param s const UChar * string
	* @param i string offset
	* @param c output uint32_t variable
	* @see U16_NEXT
	* @stable ICU 2.4
	*/
	#define U16_NEXT_UNSAFE(s, i, c) { \
	(c)=(s)[(i)++]; \
	if(U16_IS_LEAD(c)) { \
	(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
	} \
	}

	/**
	* Get a code point from a string at a code point boundary offset,
	* and advance the offset to the next code point boundary.
	* (Post-incrementing forward iteration.)
	* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
	*
	* The offset may point to the lead surrogate unit
	* for a supplementary code point, in which case the macro will read
	* the following trail surrogate as well.
	* If the offset points to a trail surrogate or
	* to a single, unpaired lead surrogate, then that itself
	* will be returned as the code point.
	*
	* @param s const UChar * string
	* @param i string offset, must be i<length
	* @param length string length
	* @param c output UChar32 variable
	* @see U16_NEXT_UNSAFE
	* @stable ICU 2.4
	*/
	#define U16_NEXT(s, i, length, c) { \
	(c)=(s)[(i)++]; \
	if(U16_IS_LEAD(c)) { \
	uint16_t __c2; \
	if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
	++(i); \
	(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
	} \
	} \
	}

	// end of icu/unicode/utf16.h
	// ----------------------------------------------------------------------------


	/**
	 * Release every buffer this object owns. Buffers flagged as weak belong to
	 * the caller and are left untouched.
	 */
	HUTF8MappedUTF16String::~HUTF8MappedUTF16String() {
	  // Every owned buffer is an array (this class allocates them with new[]), so
	  // it must be released with delete[]; scalar delete on an array is undefined
	  // behaviour. A caller-supplied strong UTF-16 buffer is therefore expected
	  // to come from new[] as well. delete[] on NULL is a no-op.
	  if (!u8to16_table_weak_) delete[] u8to16_table_;
	  u8to16_table_ = NULL;
	  if (!u16_weak_) delete[] u16_buf_;
	  u16_buf_ = NULL;
	  delete[] u8_buf_;
	  u8_buf_ = NULL;
	}


	/**
	 * (Re)set the UTF-16 data this object represents and discard every result of
	 * a previous conversion (lookup table, internal UTF-8 buffer, UTF-8 length).
	 *
	 * @param u16buf  UTF-16 code units, may be NULL.
	 * @param u16len  Number of code units in |u16buf|.
	 * @param weak    true: the caller keeps ownership of |u16buf|.
	 *                false: |this| takes ownership and releases the buffer with
	 *                delete[], so it has to be allocated with new[].
	 */
	void HUTF8MappedUTF16String::setUTF16String(unichar *u16buf, size_t u16len,
	                                            bool weak/*=true*/) {
	  // Release the previous buffer if we own it -- unless the very same buffer
	  // is being handed back to us, in which case it must stay alive.
	  if (!u16_weak_ && u16_buf_ != u16buf) delete[] u16_buf_;
	  // set new
	  u16_len_ = u16len;
	  u16_buf_ = u16buf;
	  u16_weak_ = weak;
	  // since we no longer can guarantee integrity of the map, let's waste it
	  if (!u8to16_table_weak_) delete[] u8to16_table_;
	  u8to16_table_ = NULL;
	  u8to16_table_weak_ = true;
	  u8_len_ = 0;
	  // The internal UTF-8 buffer is stale too. Reset the pointer after freeing,
	  // otherwise the next convert() or the destructor would free it again.
	  delete[] u8_buf_;
	  u8_buf_ = NULL;
	}


	/**
	 * (Re)set to represent the characters of |str| in |range|. The characters are
	 * copied into a newly allocated buffer owned by |this|, so |str| does not
	 * need to outlive this object.
	 *
	 * @param str    Source string. A nil string is treated as an empty string.
	 * @param range  Range of |str| to copy; it is passed unchecked to
	 *               -getCharacters:range:.
	 */
	void HUTF8MappedUTF16String::setNSString(NSString *str, NSRange range) {
	  if (str == nil) {
	    // Messaging nil would leave the freshly allocated buffer uninitialised
	    // while length() still reported |range.length| code units.
	    setUTF16String(NULL, 0, false);
	    return;
	  }
	  // Reset first (drops old buffers and conversion results), then install the
	  // owned copy.
	  setUTF16String(NULL, range.length, false);
	  u16_buf_ = new unichar[u16_len_];
	  [str getCharacters:u16_buf_ range:range];
	}


	/**
	 * Convert to UTF-8 using an internally owned buffer.
	 *
	 * @returns NUL-terminated UTF-8 bytes, valid until the next call to
	 *          |convert()| or |setUTF16String|, or until |this| is destroyed.
	 */
	const uint8_t *HUTF8MappedUTF16String::convert() {
	  // The buffer is allocated with new[], so it must be released with delete[].
	  // Clear the pointer before allocating so that a throwing |new| cannot leave
	  // a dangling pointer behind for the destructor to free again.
	  delete[] u8_buf_;
	  u8_buf_ = NULL;
	  // +1 leaves room for the terminating NUL
	  u8_buf_ = new uint8_t[maximumUTF8Size()+1];
	  size_t u8len = convert(u8_buf_);
	  u8_buf_[u8len] = '\0';
	  return u8_buf_;
	}


	/**
	 * Replace the contents of |str| with the UTF-8 representation.
	 */
	void HUTF8MappedUTF16String::convert(std::string &str) {
	  // Grow to the worst case so the converter can write straight into the
	  // string's storage ...
	  str.resize(maximumUTF8Size());
	  // The destination must be a pointer into the string's storage, not a
	  // single char.
	  uint8_t *bytes = reinterpret_cast<uint8_t*>(const_cast<char*>(str.data()));
	  const size_t written = convert(bytes);
	  // ... then trim to the number of bytes actually produced.
	  str.resize(written);
	}


	/**
	 * Convert the represented UTF-16 string to UTF-8, writing the bytes to |u8buf|
	 * and recording, for every byte written, the index of the UTF-16 code unit
	 * that starts the character the byte belongs to.
	 *
	 * @param u8buf         Destination; must hold at least |maximumUTF8Size|
	 *                      bytes. No terminating NUL is written.
	 * @param u8to16_table  Optional caller-owned table with at least
	 *                      |maximumUTF8Size| slots. When NULL, a table is
	 *                      allocated and owned by |this|.
	 * @returns Number of bytes written to |u8buf| (same value as |UTF8Length|).
	 */
	size_t HUTF8MappedUTF16String::convert(uint8_t *u8buf,
	                                       size_t *u8to16_table/*=NULL*/) {
	  // Release the table of a previous conversion if we own it. It is allocated
	  // with new[] below, so it has to be released with delete[].
	  if (!u8to16_table_weak_) delete[] u8to16_table_;
	  // Forget the old table before allocating so that a throwing |new| cannot
	  // leave a dangling pointer behind.
	  u8to16_table_ = NULL;
	  u8to16_table_weak_ = true;
	  if (u8to16_table) {
	    u8to16_table_ = u8to16_table;
	  } else {
	    u8to16_table_ = new size_t[maximumUTF8Size()];
	    u8to16_table_weak_ = false;
	  }

	  // reset u8_len_
	  u8_len_ = 0;

	  // For each UTF-16 character...
	  for (size_t u16i=0; u16i < u16_len_; ) {
	    // Retrieve 1-2 UTF-16 code units, forming one 32-bit unicode character.
	    // The bounds-checked macro is used on purpose: the unchecked variant
	    // reads one code unit past the end of |u16_buf_| when the string ends
	    // with an unpaired lead surrogate.
	    uint32_t u32c = 0;
	    size_t u16i_next = u16i;
	    U16_NEXT(u16_buf_, u16i_next, u16_len_, u32c);

	    // Encode u32c as 1-4 UTF-8 bytes
	    uint8_t encoded[4];
	    size_t encoded_len;
	    if (u32c <= 0x7f) {
	      encoded[0] = (uint8_t)u32c;
	      encoded_len = 1;
	    } else if (u32c <= 0x7ff) {
	      encoded[0] = (uint8_t)((u32c>>6)|0xc0);
	      encoded[1] = (uint8_t)((u32c&0x3f)|0x80);
	      encoded_len = 2;
	    } else if (u32c <= 0xffff) {
	      encoded[0] = (uint8_t)((u32c>>12)|0xe0);
	      encoded[1] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
	      encoded[2] = (uint8_t)((u32c&0x3f)|0x80);
	      encoded_len = 3;
	    } else {
	      encoded[0] = (uint8_t)((u32c>>18)|0xf0);
	      encoded[1] = (uint8_t)(((u32c>>12)&0x3f)|0x80);
	      encoded[2] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
	      encoded[3] = (uint8_t)((u32c&0x3f)|0x80);
	      encoded_len = 4;
	    }

	    // Append the bytes; every byte maps back to the first code unit of the
	    // character (the second half of a surrogate pair is never recorded).
	    for (size_t k=0; k < encoded_len; ++k) {
	      u8to16_table_[u8_len_] = u16i;
	      u8buf[u8_len_++] = encoded[k];
	    }

	    u16i = u16i_next;
	  }

	  return u8_len_;
	}


	/**
	 * Convert a range of UTF-8 bytes into the range of UTF-16 code units covering
	 * the same characters. A range that ends inside a character is extended to
	 * the end of that character (including both halves of a surrogate pair).
	 *
	 * @param u8range  Range in UTF-8 space; must lie within |UTF8Length|,
	 *                 otherwise an NSRangeException is raised.
	 * @returns        Range in UTF-16 space.
	 */
	NSRange HUTF8MappedUTF16String::UTF16RangeForUTF8Range(NSRange u8range) {
	  // Written so that location+length cannot wrap around.
	  if (u8range.location > u8_len_ ||
	      u8range.length > u8_len_ - u8range.location) {
	    [NSException raise:NSRangeException
	                format:@"Range %@ beyond end (%zu) of data",
	                       NSStringFromRange(u8range), u8_len_];
	    return NSRange(); // not reached
	  }
	  // An (empty) range at the very end has no slot in the table -- and there is
	  // no table at all before the first |convert| -- so answer it directly.
	  if (u8range.location == u8_len_) {
	    return NSMakeRange(u16_len_, 0);
	  }
	  // The table only ever records the first code unit of a character, so the
	  // start never needs adjusting.
	  NSRange u16range = {u8to16_table_[u8range.location], 0};
	  if (u8range.length != 0) {
	    // The character holding the last byte ends where the next character
	    // starts. Skip the remaining bytes of that character (at most 3) and read
	    // the start of the next one from the table, or use the end of the string.
	    // This never looks past either buffer, even for malformed UTF-16.
	    size_t u8next = u8range.location + u8range.length;
	    const size_t lastCharLocation = u8to16_table_[u8next-1];
	    while (u8next < u8_len_ && u8to16_table_[u8next] == lastCharLocation)
	      ++u8next;
	    const size_t endLocation =
	        (u8next < u8_len_) ? u8to16_table_[u8next] : u16_len_;
	    u16range.length = endLocation - u16range.location;
	  }
	  return u16range;
	}
	utf8 value => 'hej ♜\|♞\|𝄞 då'
	utf8[0] => utf16[0] -> 'h' \u68
	utf8[1] => utf16[1] -> 'e' \u65
	utf8[2] => utf16[2] -> 'j' \u6a
	utf8[3] => utf16[3] -> ' ' \u20
	utf8[4] => utf16[4] -> '♜' \u265c
	utf8[5] => utf16[4] -> '♜' \u265c
	utf8[6] => utf16[4] -> '♜' \u265c
	utf8[7] => utf16[5] -> '\|' \u7c
	utf8[8] => utf16[6] -> '♞' \u265e
	utf8[9] => utf16[6] -> '♞' \u265e
	utf8[10] => utf16[6] -> '♞' \u265e
	utf8[11] => utf16[7] -> '\|' \u7c
	utf8[12] => utf16[8..9] -> '𝄞' \ud834 \udd1e
	utf8[13] => utf16[8..9] -> '𝄞' \ud834 \udd1e
	utf8[14] => utf16[8..9] -> '𝄞' \ud834 \udd1e
	utf8[15] => utf16[8..9] -> '𝄞' \ud834 \udd1e
	utf8[16] => utf16[10] -> ' ' \u20
	utf8[17] => utf16[11] -> 'd' \u64
	utf8[18] => utf16[12] -> 'å' \ue5
	utf8[19] => utf16[12] -> 'å' \ue5
	u8range: {2, 16} -> 'j ♜\|♞\|𝄞 d'
	u16range: {2, 10} -> 'j ♜\|♞\|𝄞 d'
	#import "HUTF8MappedUTF16String.h"
	// example and a kind of a test

	#define U16_IS_SINGLE(c) !(((c)&0xfffff800)==0xd800)

	// Example / smoke test: converts a mixed-width string to UTF-8, prints the
	// UTF-8 -> UTF-16 index mapping for every byte and maps a UTF-8 range back to
	// UTF-16.
	int main (int argc, const char * argv[]) {
	  NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];

	  // Our original Unicode test string
	  unichar u16chars[] = {
	  // Unicode characters:
	  //    h    e   j        ♜    |    ♞    |        𝄞             d   å
	  // Unicode (bits/char):
	  //    8    8   8   8    16    8    16    8       32        8   8   16
	  // UTF-8 widths (bytes/char):
	  //    1    1   1   1     3    1     3    1        4        1   1    2
	        'h','e','j',' ',0x265c,'|',0x265e,'|',0xd834,0xdd1e,' ','d',0xe5 };
	  NSString *str = [NSString stringWithCharacters:u16chars length:
	      sizeof(u16chars)/sizeof(*u16chars)];
	  HUTF8MappedUTF16String mappedString;
	  mappedString.setNSString(str, NSMakeRange(0,str.length));

	  // UTF-8 buffer (+1 for the terminating NUL added below)
	  uint8_t *u8buf = new uint8_t[mappedString.maximumUTF8Size()+1];

	  // convert
	  size_t u8len = mappedString.convert(u8buf);

	  u8buf[u8len] = '\0';
	  fprintf(stderr, "utf8 value => '%s'\n", u8buf);

	  for (size_t i=0; i<u8len; i++) {
	    size_t index = mappedString.UTF16IndexForUTF8Index(i);
	    unichar c = mappedString[index];

	    if (U16_IS_SINGLE(c)) {
	      NSLog(@"utf8[%zu] => utf16[%zu] -> '%C' \\u%x", i, index, c, c);
	    } else {
	      NSLog(@"utf8[%zu] => utf16[%zu..%zu] -> '%C%C' \\u%x \\u%x",
	            i, index, index+1, c, mappedString[index+1],
	            c, mappedString[index+1]);
	    }
	  }

	  NSRange u8range = NSMakeRange(2, u8len-4); // should be "j ♜|♞|𝄞 d"
	  //u8range = NSMakeRange(12, 1); // should be "𝄞"
	  NSString *u8substr = // temporary so we can use NSLog
	      [[[NSString alloc] initWithBytesNoCopy:u8buf+u8range.location
	                                      length:u8range.length
	                                    encoding:NSUTF8StringEncoding
	                                freeWhenDone:NO] autorelease];
	  NSLog(@"u8range: %@ -> '%@'", NSStringFromRange(u8range), u8substr);
	  NSRange u16range = mappedString.UTF16RangeForUTF8Range(u8range);
	  NSString *u16substr = [str substringWithRange:u16range];
	  NSLog(@"u16range: %@ -> '%@'", NSStringFromRange(u16range), u16substr);

	  [pool drain];
	  // |u8substr| referenced the bytes of |u8buf| without copying them, so the
	  // buffer may only be released once the pool has let go of the string.
	  delete[] u8buf;
	  return 0;
	}