Skip to content

Instantly share code, notes, and snippets.

@cjheath
Created March 13, 2022 10:07
Show Gist options
  • Save cjheath/a148f710be0007057061eaa5fa8013d2 to your computer and use it in GitHub Desktop.
Save cjheath/a148f710be0007057061eaa5fa8013d2 to your computer and use it in GitHub Desktop.
A C++ class that encapsulates a pointer to UTF-8 data so that "normal" char-pointer idioms work on multi-byte UTF-8 characters
// See https://github.com/cjheath/strpp/blob/main/include/char_encoding.h
#include <char_encoding.h>
// This MOSTLY works. I just need a way to make these both work:
// UCS4 ch = *ptr++;
// *ptr++ = ch; // <<<< This is the tricky one.
class UTF8P
{
private:
UTF8* data;
public:
UTF8P(UTF8* s) : data(s) {} // Normal constructor
UTF8P(UTF8P& c) : data(c.data) {} // Copy constructor
UTF8P(const UTF8P& c) : data(c.data) {} // Copy constructor
~UTF8P() {};
UTF8P& operator=(UTF8* s) // Assignment
{ data = s; return *this; }
operator UTF8*() { return data; } // Access the UTF8 bytes
UCS4 operator*() // Dereference to char under the pointer
{ const UTF8* s = data; return UTF8Get(s); }
static int len(UCS4 ch) // Length in bytes of this UCS4 character
{ return UTF8Len(ch); }
int len() // length in bytes of character under the pointer
{ return UTF8Len(data); }
static bool is1st(UTF8* s) // Is this looking at the start of a UTF8 character?
{ return UTF8Is1st(*s); }
bool is1st() // Are we looking at the start of a UTF8 character?
{ return UTF8Is1st(*data); }
// Add and subtract integers:
UTF8P& operator+=(int i)
{ const UTF8* s = data;
while (i > 0) { UTF8Get(s); i--;} // Advance
while (i < 0) { s = UTF8Backup(s); i++;} // Or backup
data = (UTF8*)s; return *this;
}
UTF8P operator+(int i) { UTF8P t(*this); t += i; return t; }
UTF8P operator-=(int i) { return *this += -i; }
UTF8P operator-(int i) { UTF8P t(*this); t += -i; return t; }
// incr/decr functions:
UTF8P& preincr() { const UTF8* s = data; UTF8Get(s); data = (UTF8*)s; return *this; }
UTF8P postincr() { UTF8P save(*this); ++*this; return save; }
UTF8P& predecr() { data = (UTF8*)UTF8Backup(data); return *this; }
UTF8P postdecr() { UTF8P save(*this); --*this; return save; }
// incr/decr operators:
UTF8P& operator++() { return preincr(); }
UTF8P operator++(int) { return postincr(); }
UTF8P& operator--() { return predecr(); }
UTF8P operator--(int) { return postdecr(); }
// Store a character, advancing the pointer (like *ptr++ = ch)
UTF8P& put(UCS4 ch) { UTF8Put(data, ch); return *this; }
// This is not what I need, I want "*ptr++ = ch;" to do the right thing. I think that means the operator* and operator-> must be special
UTF8P& operator=(UCS4 ch) { return put(ch); }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment