MightyPork/unicode_cache.c

## unicode_cache.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>


// --- espterm stubs ---
// Minimal stand-ins for names the cache code expects from its usual
// (espterm) build environment, so the file can be compiled on its own.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
// Function attribute; expands to nothing here.
#define ICACHE_FLASH_ATTR
// Logging helpers: print a severity tag, the formatted message and a newline.
#define dbg(fmt, ...) printf("[ ] "fmt"\n", ##__VA_ARGS__)
#define warn(fmt, ...) printf("[w] "fmt"\n", ##__VA_ARGS__)
#define error(fmt, ...) printf("[!] "fmt"\n", ##__VA_ARGS__)
// True if a and b compare equal over at most n bytes (comparison stops at NUL).
#define strneq(a, b, n) (strncmp((const char*)(a), (const char*)(b), (n)) == 0)
// ---------------------

/** Look-up reference to a code point; a single byte. */
typedef u8 UnicodeCacheRef;

/** Number of slots in the cache. */
#define UNICODE_CACHE_SIZE 160

/** One cache entry. */
typedef struct {
	u8 bytes[4];    // stored utf8 bytes of the code point
	uint16_t count; // use counter; 0 means the slot is free
} UnicodeCacheSlot;

static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE];

/*
 * Slot id <-> reference mapping.
 *
 * Ids 0-31 are used as refs unchanged; ids 32 and above are shifted up by 95
 * to refs 127 and above, which keeps refs out of the 32-126 range that the
 * cache functions treat as plain ASCII. REF_TO_ID is the exact inverse of
 * ID_TO_REF. Ref 255 decodes to id 160, which is one past the last slot, so
 * a decoded id has to be range-checked before it is used as an index.
 *
 * Both macros evaluate their argument more than once.
 */
#define REF_TO_ID(c) ((u8)((c) >= 127 ? (c) - 95 : (c)))
#define ID_TO_REF(c) ((UnicodeCacheRef)((c) > 31 ? (c) + 95 : (c)))

/**
 * Add a code point to the cache. Printable ASCII is passed through.
 * If the code point is already stored, its use counter is incremented.
 *
 * @param bytes - utf8 bytes of one code point; at most 4 bytes are read,
 *                a shorter sequence must be NUL-terminated
 * @return the obtained look-up reference, or '?' if the first byte is a
 *         control character or the cache is full
 */
UnicodeCacheRef ICACHE_FLASH_ATTR
unicode_cache_add(const u8 *bytes) {
	// Control bytes have to be rejected before the ASCII bypass: returned
	// unchanged they would become refs 0-31, which the look-up functions
	// resolve to cache slots instead of characters.
	if (bytes[0] < 32) {
		warn("utf8 cache illegal store '%c'", bytes[0]);
		return '?';
	}

	if (bytes[0] < 127) return bytes[0]; // ASCII, bypass

	// Single pass: a slot that already holds these bytes always wins (a
	// released one is revived); otherwise the first free slot seen is used.
	int free_slot = -1;
	for (int slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
		if (strneq(cache[slot].bytes, bytes, 4)) {
			cache[slot].count++;
			if (cache[slot].count == 1) {
				dbg("utf8 cache resurrect '%.4s' @ %d", bytes, slot);
			} else {
				dbg("utf8 cache inc '%.4s' @ %d, %d uses", bytes, slot, cache[slot].count);
			}
			return ID_TO_REF(slot);
		}
		if (free_slot < 0 && cache[slot].count == 0) {
			free_slot = slot;
		}
	}

	if (free_slot < 0) {
		error("utf8 cache full");
		return '?'; // fallback to normal ASCII that will show to the user
	}

	// strncpy zero-fills the rest of the 4-byte field for shorter sequences
	strncpy((char *)cache[free_slot].bytes, (const char *)bytes, 4);
	cache[free_slot].count = 1;
	dbg("utf8 cache new '%.4s' @ %d", bytes, free_slot);
	return ID_TO_REF(free_slot);
}

/**
 * Look up a code point in the cache by reference. Does not change the use counter.
 *
 * @param ref - reference obtained earlier using unicode_cache_add()
 * @param target - buffer of size 4 to hold the result. A cached code point is
 *                 copied as 4 bytes and is NOT NUL-terminated when it occupies
 *                 all 4; the ASCII and failure paths write one character
 *                 followed by NUL.
 * @return true if the look-up succeeded
 */
bool ICACHE_FLASH_ATTR
unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target) {
	if (ref > 31 && ref < 127) {
		// ASCII, bypass
		target[0] = ref;
		target[1] = 0;
		return true;
	}

	u8 slot = REF_TO_ID(ref);

	// Never index outside the cache, whatever ref the caller passes in
	if (slot >= UNICODE_CACHE_SIZE) {
		target[0] = '?';
		target[1] = 0;
		error("utf8 cache invalid ref %d", ref);
		return false;
	}

	if (cache[slot].count == 0) {
		// "use after free"
		target[0] = '?';
		target[1] = 0;
		error("utf8 cache use-after-free @ %d (freed)", slot);
		return false;
	}

	dbg("utf8 cache hit '%.4s' @ %d, uses %d", cache[slot].bytes, slot, cache[slot].count);
	strncpy((char *)target, (const char *)cache[slot].bytes, 4);
	return true;
}

/**
 * Remove an occurrence of a code point from the cache.
 * The use counter is decremented; when it reaches zero the slot counts as
 * free. The stored bytes are left in place.
 *
 * @param ref - reference to remove or reduce
 * @return true if the reference was ASCII or a slot in use; false if the
 *         slot was already free or the reference is out of range
 */
bool ICACHE_FLASH_ATTR
unicode_cache_remove(UnicodeCacheRef ref) {
	if (ref > 31 && ref < 127) return true; // ASCII, bypass

	u8 slot = REF_TO_ID(ref);

	// Never index outside the cache, whatever ref the caller passes in
	if (slot >= UNICODE_CACHE_SIZE) {
		error("utf8 cache invalid ref %d", ref);
		return false;
	}

	if (cache[slot].count == 0) {
		error("utf8 cache double-free @ %d", slot);
		return false;
	}

	cache[slot].count--;
	if (cache[slot].count) {
		dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
	} else {
		dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot);
	}
	return true;
}


void main (void)
{
	u8 buf[4];

	u8 cc = unicode_cache_add("č\0\0");
	unicode_cache_add("č\0\0");
	u8 rc = unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	u8 heart = unicode_cache_add("💙\0");

	unicode_cache_retrieve(cc, buf);
	printf("%.4s\n", buf);
	unicode_cache_retrieve(rc, buf);
	printf("%.4s\n", buf);
	unicode_cache_retrieve(heart, buf);
	printf("%.4s\n", buf);

	unicode_cache_remove(heart);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);

	unicode_cache_add("💙\0");
	unicode_cache_add("A\0\0\0");
	unicode_cache_add("ñ\0\0");

	unicode_cache_remove(cc);
	unicode_cache_remove(cc);
	unicode_cache_add("ñ\0\0");
	unicode_cache_remove(heart);
	unicode_cache_add("ñ\0\0");
	unicode_cache_add("±\0\0");
	unicode_cache_add("¯\0\0");
}

	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <stdbool.h>
	#include <string.h>



	// --- espterm stubs ---
	// Minimal stand-ins for names the cache code expects from its usual
	// (espterm) build environment, so the file can be compiled on its own.
	typedef uint8_t u8;
	typedef uint16_t u16;
	typedef uint32_t u32;
	// Function attribute; expands to nothing here.
	#define ICACHE_FLASH_ATTR
	// Logging helpers: print a severity tag, the formatted message and a newline.
	#define dbg(fmt, ...) printf("[ ] "fmt"\n", ##__VA_ARGS__)
	#define warn(fmt, ...) printf("[w] "fmt"\n", ##__VA_ARGS__)
	#define error(fmt, ...) printf("[!] "fmt"\n", ##__VA_ARGS__)
	// True if a and b compare equal over at most n bytes (comparison stops at NUL).
	// The casts must be to pointer types: strncmp takes const char *.
	#define strneq(a, b, n) (strncmp((const char*)(a), (const char*)(b), (n)) == 0)
	// ---------------------

	/** Look-up reference to a code point; a single byte. */
	typedef u8 UnicodeCacheRef;

	/** Number of slots in the cache. */
	#define UNICODE_CACHE_SIZE 160

	/** One cache entry. */
	typedef struct {
		u8 bytes[4];    // stored utf8 bytes of the code point
		uint16_t count; // use counter; 0 means the slot is free
	} UnicodeCacheSlot;

	static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE];

	/*
	 * Slot id <-> reference mapping.
	 *
	 * Ids 0-31 are used as refs unchanged; ids 32 and above are shifted up by 95
	 * to refs 127 and above, which keeps refs out of the 32-126 range that the
	 * cache functions treat as plain ASCII. REF_TO_ID is the exact inverse of
	 * ID_TO_REF. Ref 255 decodes to id 160, which is one past the last slot, so
	 * a decoded id has to be range-checked before it is used as an index.
	 *
	 * Both macros evaluate their argument more than once.
	 */
	#define REF_TO_ID(c) ((u8)((c) >= 127 ? (c) - 95 : (c)))
	#define ID_TO_REF(c) ((UnicodeCacheRef)((c) > 31 ? (c) + 95 : (c)))

	/**
	 * Add a code point to the cache. Printable ASCII is passed through.
	 * If the code point is already stored, its use counter is incremented.
	 *
	 * @param bytes - utf8 bytes of one code point; at most 4 bytes are read,
	 *                a shorter sequence must be NUL-terminated
	 * @return the obtained look-up reference, or '?' if the first byte is a
	 *         control character or the cache is full
	 */
	UnicodeCacheRef ICACHE_FLASH_ATTR
	unicode_cache_add(const u8 *bytes) {
		// Control bytes have to be rejected before the ASCII bypass: returned
		// unchanged they would become refs 0-31, which the look-up functions
		// resolve to cache slots instead of characters.
		if (bytes[0] < 32) {
			warn("utf8 cache illegal store '%c'", bytes[0]);
			return '?';
		}

		if (bytes[0] < 127) return bytes[0]; // ASCII, bypass

		// Single pass: a slot that already holds these bytes always wins (a
		// released one is revived); otherwise the first free slot seen is used.
		int free_slot = -1;
		for (int slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
			if (strneq(cache[slot].bytes, bytes, 4)) {
				cache[slot].count++;
				if (cache[slot].count == 1) {
					dbg("utf8 cache resurrect '%.4s' @ %d", bytes, slot);
				} else {
					dbg("utf8 cache inc '%.4s' @ %d, %d uses", bytes, slot, cache[slot].count);
				}
				return ID_TO_REF(slot);
			}
			if (free_slot < 0 && cache[slot].count == 0) {
				free_slot = slot;
			}
		}

		if (free_slot < 0) {
			error("utf8 cache full");
			return '?'; // fallback to normal ASCII that will show to the user
		}

		// strncpy zero-fills the rest of the 4-byte field for shorter sequences
		strncpy((char *)cache[free_slot].bytes, (const char *)bytes, 4);
		cache[free_slot].count = 1;
		dbg("utf8 cache new '%.4s' @ %d", bytes, free_slot);
		return ID_TO_REF(free_slot);
	}

	/**
	 * Look up a code point in the cache by reference. Does not change the use counter.
	 *
	 * @param ref - reference obtained earlier using unicode_cache_add()
	 * @param target - buffer of size 4 to hold the result. A cached code point is
	 *                 copied as 4 bytes and is NOT NUL-terminated when it occupies
	 *                 all 4; the ASCII and failure paths write one character
	 *                 followed by NUL.
	 * @return true if the look-up succeeded
	 */
	bool ICACHE_FLASH_ATTR
	unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target) {
		if (ref > 31 && ref < 127) {
			// ASCII, bypass
			target[0] = ref;
			target[1] = 0;
			return true;
		}

		u8 slot = REF_TO_ID(ref);

		// Never index outside the cache, whatever ref the caller passes in
		if (slot >= UNICODE_CACHE_SIZE) {
			target[0] = '?';
			target[1] = 0;
			error("utf8 cache invalid ref %d", ref);
			return false;
		}

		if (cache[slot].count == 0) {
			// "use after free"
			target[0] = '?';
			target[1] = 0;
			error("utf8 cache use-after-free @ %d (freed)", slot);
			return false;
		}

		dbg("utf8 cache hit '%.4s' @ %d, uses %d", cache[slot].bytes, slot, cache[slot].count);
		strncpy((char *)target, (const char *)cache[slot].bytes, 4);
		return true;
	}

	/**
	 * Remove an occurrence of a code point from the cache.
	 * The use counter is decremented; when it reaches zero the slot counts as
	 * free. The stored bytes are left in place.
	 *
	 * @param ref - reference to remove or reduce
	 * @return true if the reference was ASCII or a slot in use; false if the
	 *         slot was already free or the reference is out of range
	 */
	bool ICACHE_FLASH_ATTR
	unicode_cache_remove(UnicodeCacheRef ref) {
		if (ref > 31 && ref < 127) return true; // ASCII, bypass

		u8 slot = REF_TO_ID(ref);

		// Never index outside the cache, whatever ref the caller passes in
		if (slot >= UNICODE_CACHE_SIZE) {
			error("utf8 cache invalid ref %d", ref);
			return false;
		}

		if (cache[slot].count == 0) {
			error("utf8 cache double-free @ %d", slot);
			return false;
		}

		cache[slot].count--;
		if (cache[slot].count) {
			dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
		} else {
			dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot);
		}
		return true;
	}


	void main (void)
	{
	u8 buf[4];

	u8 cc = unicode_cache_add("č\0\0");
	unicode_cache_add("č\0\0");
	u8 rc = unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	unicode_cache_add("ř\0\0");
	u8 heart = unicode_cache_add("💙\0");

	unicode_cache_retrieve(cc, buf);
	printf("%.4s\n", buf);
	unicode_cache_retrieve(rc, buf);
	printf("%.4s\n", buf);
	unicode_cache_retrieve(heart, buf);
	printf("%.4s\n", buf);

	unicode_cache_remove(heart);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);
	unicode_cache_remove(rc);

	unicode_cache_add("💙\0");
	unicode_cache_add("A\0\0\0");
	unicode_cache_add("ñ\0\0");

	unicode_cache_remove(cc);
	unicode_cache_remove(cc);
	unicode_cache_add("ñ\0\0");
	unicode_cache_remove(heart);
	unicode_cache_add("ñ\0\0");
	unicode_cache_add("±\0\0");
	unicode_cache_add("¯\0\0");
	}