Skip to content

Instantly share code, notes, and snippets.

@dce
Created April 1, 2013 11:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dce/5284560 to your computer and use it in GitHub Desktop.
Save dce/5284560 to your computer and use it in GitHub Desktop.
Experimenting with Rope (https://github.com/josephg/librope)
// https://github.com/josephg/librope
#include <stdlib.h>
#include <stdio.h>
#include "rope.h"
int
main()
{
uint8_t *str;
rope *r = rope_new();
rope_insert(r, 0, "Hello, World!");
str = malloc(rope_byte_count(r));
rope_write_cstr(r, str);
printf("%s\n", str);
free(str);
rope_del(r, 5, 7);
str = malloc(rope_byte_count(r));
rope_write_cstr(r, str);
printf("%s\n", str);
free(str);
rope_insert(r, 5, ", Pootrick");
str = malloc(rope_byte_count(r));
rope_write_cstr(r, str);
printf("%s\n", str);
free(str);
rope_free(r);
return 0;
}
CFLAGS = -g -Wall -Wextra
all:
make rope
gcc -o main main.c rope.o
rope:
gcc -o rope.o -c rope.c
clean:
rm -rf main *.dSYM rope.o
// Implementation for rope library.
#include <stdlib.h>
#include <string.h>
// Needed for VC++, which always compiles in C++ mode and doesn't have stdbool.
#ifndef __cplusplus
#include <stdbool.h>
#endif
#include <assert.h>
#include "rope.h"
// The number of bytes the rope head structure takes up
static const size_t ROPE_SIZE = sizeof(rope) + sizeof(rope_node) * ROPE_MAX_HEIGHT;
// Create a new rope with no contents
rope *rope_new2(void *(*alloc)(size_t bytes),
void *(*realloc)(void *ptr, size_t newsize),
void (*free)(void *ptr)) {
rope *r = (rope *)alloc(ROPE_SIZE);
r->num_chars = r->num_bytes = 0;
r->alloc = alloc;
r->realloc = realloc;
r->free = free;
r->head.height = 1;
r->head.num_bytes = 0;
r->head.nexts[0].node = NULL;
r->head.nexts[0].skip_size = 0;
#if ROPE_WCHAR
r->head.nexts[0].wchar_size = 0;
#endif
return r;
}
rope *rope_new() {
return rope_new2(malloc, realloc, free);
}
// Create a new rope containing the specified string
rope *rope_new_with_utf8(const uint8_t *str) {
rope *r = rope_new();
rope_insert(r, 0, str);
return r;
}
rope *rope_copy(const rope *other) {
int i;
rope_node *n;
rope *r = (rope *)other->alloc(ROPE_SIZE);
// Just copy most of the head's data. Note this won't copy the nexts list in head.
*r = *other;
rope_node *nodes[ROPE_MAX_HEIGHT];
for (i = 0; i < other->head.height; i++) {
nodes[i] = &r->head;
// non-NULL next pointers will be rewritten below.
r->head.nexts[i] = other->head.nexts[i];
}
for (n = other->head.nexts[0].node; n != NULL; n = n->nexts[0].node) {
unsigned int i;
// I wonder if it would be faster if we took this opportunity to rebalance the node list..?
size_t h = n->height;
rope_node *n2 = (rope_node *)r->alloc(sizeof(rope_node) + h * sizeof(rope_skip_node));
// Would it be faster to just *n2 = *n; ?
n2->num_bytes = n->num_bytes;
n2->height = h;
memcpy(n2->str, n->str, n->num_bytes);
memcpy(n2->nexts, n->nexts, h * sizeof(rope_skip_node));
for (i = 0; i < h; i++) {
nodes[i]->nexts[i].node = n2;
nodes[i] = n2;
}
}
return r;
}
// Free the specified rope
void rope_free(rope *r) {
assert(r);
rope_node *next, *n;
for (n = r->head.nexts[0].node; n != NULL; n = next) {
next = n->nexts[0].node;
r->free(n);
}
r->free(r);
}
// Get the number of characters in a rope
size_t rope_char_count(rope *r) {
assert(r);
return r->num_chars;
}
// Get the number of bytes which the rope would take up if stored as a utf8
// string
size_t rope_byte_count(rope *r) {
assert(r);
return r->num_bytes;
}
// Copies the rope's contents into a utf8 encoded C string. Also copies a trailing '\0' character.
// Returns the number of bytes written, which is rope_byte_count(r) + 1.
size_t rope_write_cstr(rope *r, uint8_t *dest) {
rope_node *n;
size_t num_bytes = rope_byte_count(r);
dest[num_bytes] = '\0';
if (num_bytes) {
uint8_t *p = dest;
for (n = &r->head; n != NULL; n = n->nexts[0].node) {
memcpy(p, n->str, n->num_bytes);
p += n->num_bytes;
}
assert(p == &dest[num_bytes]);
}
return num_bytes + 1;
}
// Create a new C string which contains the rope. The string will contain
// the rope encoded as utf8.
uint8_t *rope_create_cstr(rope *r) {
uint8_t *bytes = (uint8_t *)r->alloc(rope_byte_count(r) + 1); // Room for a zero.
rope_write_cstr(r, bytes);
return bytes;
}
#if ROPE_WCHAR
size_t rope_wchar_count(rope *r) {
assert(r);
return r->head.nexts[r->head.height - 1].wchar_size;
}
#endif
#define MIN(x,y) ((x) > (y) ? (y) : (x))
#define MAX(x,y) ((x) > (y) ? (x) : (y))
#ifdef _WIN32
inline static long random() {
return rand();
}
#endif
static uint8_t random_height() {
// This function is horribly inefficient. I'm throwing away heaps of entropy, and
// the mod could be replaced by some clever shifting.
//
// However, random_height barely appears in the profiler output - so its probably
// not worth investing the time to optimise.
uint8_t height = 1;
// The root node's height is the height of the largest node + 1, so the largest
// node can only have ROPE_MAX_HEIGHT - 1.
while(height < (ROPE_MAX_HEIGHT - 1) && (random() % 100) < ROPE_BIAS) {
height++;
}
return height;
}
// Figure out how many bytes to allocate for a node with the specified height.
static size_t node_size(uint8_t height) {
return sizeof(rope_node) + height * sizeof(rope_skip_node);
}
// Allocate and return a new node. The new node will be full of junk, except
// for its height.
// This function should be replaced at some point with an object pool based version.
static rope_node *alloc_node(rope *r, uint8_t height) {
rope_node *node = (rope_node *)r->alloc(node_size(height));
node->height = height;
return node;
}
// Find out how many bytes the unicode character which starts with the specified byte
// will occupy in memory.
// Returns the number of bytes, or SIZE_MAX if the byte is invalid.
static inline size_t codepoint_size(uint8_t byte) {
if (byte <= 0x7f) { return 1; } // 0x74 = 0111 1111
else if (byte <= 0xbf) { return SIZE_MAX; } // 1011 1111. Invalid for a starting byte.
else if (byte <= 0xdf) { return 2; } // 1101 1111
else if (byte <= 0xef) { return 3; } // 1110 1111
else if (byte <= 0xf7) { return 4; } // 1111 0111
else if (byte <= 0xfb) { return 5; } // 1111 1011
else if (byte <= 0xfd) { return 6; } // 1111 1101
else { return SIZE_MAX; }
}
// This little function counts how many bytes a certain number of characters take up.
static size_t count_bytes_in_utf8(const uint8_t *str, size_t num_chars) {
unsigned int i;
const uint8_t *p = str;
for (i = 0; i < num_chars; i++) {
p += codepoint_size(*p);
}
return p - str;
}
#if ROPE_WCHAR
#define NEEDS_TWO_WCHARS(x) (((x) & 0xf0) == 0xf0)
static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) {
unsigned int i;
size_t wchars = 0;
for (i = 0; i < num_chars; i++) {
wchars += 1 + NEEDS_TWO_WCHARS(*str);
str += codepoint_size(*str);
}
return wchars;
}
static size_t count_utf8_in_wchars(const uint8_t *str, size_t num_wchars) {
unsigned int i;
size_t chars = num_wchars;
for (i = 0; i < num_wchars; i++) {
if (NEEDS_TWO_WCHARS(*str)) {
chars--;
i++;
}
str += codepoint_size(*str);
}
return chars;
}
#endif
// Count the number of characters in a string.
static size_t strlen_utf8(const uint8_t *str) {
const uint8_t *p = str;
size_t i = 0;
while (*p) {
p += codepoint_size(*p);
i++;
}
return i;
}
typedef struct {
// This stores the previous node at each height, and the number of characters from the start of
// the previous node to the current iterator position.
rope_skip_node s[ROPE_MAX_HEIGHT];
} rope_iter;
// Internal function for navigating to a particular character offset in the rope.
// The function returns the list of nodes which point past the position, as well as
// offsets of how far into their character lists the specified characters are.
static rope_node *iter_at_char_pos(rope *r, size_t char_pos, rope_iter *iter) {
assert(char_pos <= r->num_chars);
rope_node *e = &r->head;
int height = r->head.height - 1;
// Offset stores how many characters we still need to skip in the current node.
size_t offset = char_pos;
size_t skip;
#if ROPE_WCHAR
size_t wchar_pos = 0; // Current wchar pos from the start of the rope.
#endif
while (true) {
skip = e->nexts[height].skip_size;
if (offset > skip) {
// Go right.
assert(e == &r->head || e->num_bytes);
offset -= skip;
#if ROPE_WCHAR
wchar_pos += e->nexts[height].wchar_size;
#endif
e = e->nexts[height].node;
} else {
// Go down.
iter->s[height].skip_size = offset;
iter->s[height].node = e;
#if ROPE_WCHAR
iter->s[height].wchar_size = wchar_pos;
#endif
if (height == 0) {
break;
} else {
height--;
}
}
}
#if ROPE_WCHAR
int i;
// For some reason, this is _REALLY SLOW_. Like, 5.5Mops/s -> 4Mops/s from this block of code.
wchar_pos += count_wchars_in_utf8(e->str, offset);
// The iterator has the wchar pos from the start of the whole string.
for (i = 0; i < r->head.height; i++) {
iter->s[i].wchar_size = wchar_pos - iter->s[i].wchar_size;
}
#endif
assert(offset <= ROPE_NODE_STR_SIZE);
assert(iter->s[0].node == e);
return e;
}
#if ROPE_WCHAR
// Equivalent of iter_at_char_pos, but for wchar positions instead.
static rope_node *iter_at_wchar_pos(rope *r, size_t wchar_pos, rope_iter *iter) {
int i;
int height = r->head.height - 1;
assert(wchar_pos <= r->head.nexts[height].wchar_size);
rope_node *e = &r->head;
// Offset stores how many wchar characters we still need to skip in the current node.
size_t offset = wchar_pos;
size_t skip;
size_t char_pos = 0; // Current char pos from the start of the rope.
while (true) {
skip = e->nexts[height].wchar_size;
if (offset > skip) {
// Go right.
offset -= skip;
char_pos += e->nexts[height].skip_size;
e = e->nexts[height].node;
} else {
// Go down.
iter->s[height].skip_size = char_pos;
iter->s[height].node = e;
iter->s[height].wchar_size = offset;
if (height == 0) {
break;
} else {
height--;
}
}
}
char_pos += count_utf8_in_wchars(e->str, offset);
// The iterator has character positions from the start of the rope to the start of the node.
for (i = 0; i < r->head.height; i++) {
iter->s[i].skip_size = char_pos - iter->s[i].skip_size;
}
assert(e == iter->s[0].node);
return e;
}
#endif
#if ROPE_WCHAR
static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars, size_t num_wchars) {
int i;
for (i = 0; i < r->head.height; i++) {
iter->s[i].node->nexts[i].skip_size += num_chars;
iter->s[i].node->nexts[i].wchar_size += num_wchars;
}
}
#else
static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars) {
int i;
for (i = 0; i < r->head.height; i++) {
iter->s[i].node->nexts[i].skip_size += num_chars;
}
}
#endif
// Internal method of rope_insert.
// This function creates a new node in the rope at the specified position and fills it with the
// passed string.
static void insert_at(rope *r, rope_iter *iter,
const uint8_t *str, size_t num_bytes, size_t num_chars) {
#if ROPE_WCHAR
size_t num_wchars = count_wchars_in_utf8(str, num_chars);
#endif
// This describes how many levels of the iter are filled in.
uint8_t max_height = r->head.height;
uint8_t new_height = random_height();
rope_node *new_node = alloc_node(r, new_height);
new_node->num_bytes = num_bytes;
memcpy(new_node->str, str, num_bytes);
assert(new_height < ROPE_MAX_HEIGHT);
// Max height (the rope's head's height) must be 1+ the height of the largest node.
while (max_height <= new_height) {
r->head.height++;
r->head.nexts[max_height] = r->head.nexts[max_height - 1];
// This is the position (offset from the start) of the rope.
iter->s[max_height] = iter->s[max_height - 1];
max_height++;
}
// Fill in the new node's nexts array.
int i;
for (i = 0; i < new_height; i++) {
rope_skip_node *prev_skip = &iter->s[i].node->nexts[i];
new_node->nexts[i].node = prev_skip->node;
new_node->nexts[i].skip_size = num_chars + prev_skip->skip_size - iter->s[i].skip_size;
prev_skip->node = new_node;
prev_skip->skip_size = iter->s[i].skip_size;
// & move the iterator to the end of the newly inserted node.
iter->s[i].node = new_node;
iter->s[i].skip_size = num_chars;
#if ROPE_WCHAR
new_node->nexts[i].wchar_size = num_wchars + prev_skip->wchar_size - iter->s[i].wchar_size;
prev_skip->wchar_size = iter->s[i].wchar_size;
iter->s[i].wchar_size = num_wchars;
#endif
}
for (; i < max_height; i++) {
iter->s[i].node->nexts[i].skip_size += num_chars;
iter->s[i].skip_size += num_chars;
#if ROPE_WCHAR
iter->s[i].node->nexts[i].wchar_size += num_wchars;
iter->s[i].wchar_size += num_wchars;
#endif
}
r->num_chars += num_chars;
r->num_bytes += num_bytes;
}
// Insert the given utf8 string into the rope at the specified position.
static void rope_insert_at_iter(rope *r, rope_node *e, rope_iter *iter, const uint8_t *str) {
int i;
// iter.offset contains how far (in characters) into the current element to skip.
// Figure out how much that is in bytes.
size_t offset_bytes = 0;
// The insertion offset into the destination node.
size_t offset = iter->s[0].skip_size;
if (offset) {
assert(offset <= e->nexts[0].skip_size);
offset_bytes = count_bytes_in_utf8(e->str, offset);
}
// Maybe we can insert the characters into the current node?
size_t num_inserted_bytes = strlen((char *)str);
// Can we insert into the current node?
bool insert_here = e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;
// Can we insert into the subsequent node?
rope_node *next = NULL;
if (!insert_here && offset_bytes == e->num_bytes) {
next = e->nexts[0].node;
// We can insert into the subsequent node if:
// - We can't insert into the current node
// - There _is_ a next node to insert into
// - The insert would be at the start of the next node
// - There's room in the next node
if (next && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE) {
offset = offset_bytes = 0;
for (i = 0; i < next->height; i++) {
iter->s[i].node = next;
// tree offset nodes will not be used.
}
e = next;
insert_here = true;
}
}
if (insert_here) {
// First move the current bytes later on in the string.
if (offset_bytes < e->num_bytes) {
memmove(&e->str[offset_bytes + num_inserted_bytes],
&e->str[offset_bytes],
e->num_bytes - offset_bytes);
}
// Then copy in the string bytes
memcpy(&e->str[offset_bytes], str, num_inserted_bytes);
e->num_bytes += num_inserted_bytes;
r->num_bytes += num_inserted_bytes;
size_t num_inserted_chars = strlen_utf8(str);
r->num_chars += num_inserted_chars;
// .... aaaand update all the offset amounts.
#if ROPE_WCHAR
size_t num_inserted_wchars = count_wchars_in_utf8(str, num_inserted_chars);
update_offset_list(r, iter, num_inserted_chars, num_inserted_wchars);
#else
update_offset_list(r, iter, num_inserted_chars);
#endif
} else {
// There isn't room. We'll need to add at least one new node to the rope.
// If we're not at the end of the current node, we'll need to remove
// the end of the current node's data and reinsert it later.
size_t num_end_chars, num_end_bytes = e->num_bytes - offset_bytes;
if (num_end_bytes) {
// We'll pretend like the character have been deleted from the node, while leaving
// the bytes themselves there (for later).
e->num_bytes = offset_bytes;
num_end_chars = e->nexts[0].skip_size - offset;
#if ROPE_WCHAR
size_t num_end_wchars = count_wchars_in_utf8(&e->str[offset_bytes], num_end_chars);
update_offset_list(r, iter, -num_end_chars, -num_end_wchars);
#else
update_offset_list(r, iter, -num_end_chars);
#endif
r->num_chars -= num_end_chars;
r->num_bytes -= num_end_bytes;
}
// Now we insert new nodes containing the new character data. The data must be broken into
// pieces of with a maximum size of ROPE_NODE_STR_SIZE. Node boundaries must not occur in the
// middle of a utf8 codepoint.
size_t str_offset = 0;
while (str_offset < num_inserted_bytes) {
size_t new_node_bytes = 0;
size_t new_node_chars = 0;
while (str_offset + new_node_bytes < num_inserted_bytes) {
size_t cs = codepoint_size(str[str_offset + new_node_bytes]);
if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) {
break;
} else {
new_node_bytes += cs;
new_node_chars++;
}
}
insert_at(r, iter, &str[str_offset], new_node_bytes, new_node_chars);
str_offset += new_node_bytes;
}
if (num_end_bytes) {
insert_at(r, iter, &e->str[offset_bytes], num_end_bytes, num_end_chars);
}
}
}
void rope_insert(rope *r, size_t pos, const uint8_t *str) {
assert(r);
assert(str);
#ifdef DEBUG
_rope_check(r);
#endif
pos = MIN(pos, r->num_chars);
rope_iter iter;
// First we need to search for the node where we'll insert the string.
rope_node *e = iter_at_char_pos(r, pos, &iter);
rope_insert_at_iter(r, e, &iter, str);
#ifdef DEBUG
_rope_check(r);
#endif
}
#if ROPE_WCHAR
// Insert the given utf8 string into the rope at the specified position.
size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *str) {
assert(r);
assert(str);
#ifdef DEBUG
_rope_check(r);
#endif
wchar_pos = MIN(wchar_pos, rope_wchar_count(r));
rope_iter iter;
// First we need to search for the node where we'll insert the string.
rope_node *e = iter_at_wchar_pos(r, wchar_pos, &iter);
size_t pos = iter.s[r->head.height - 1].skip_size;
rope_insert_at_iter(r, e, &iter, str);
#ifdef DEBUG
_rope_check(r);
#endif
return pos;
}
#endif
// Delete num characters at position pos. Deleting past the end of the string
// has no effect.
static void rope_del_at_iter(rope *r, rope_node *e, rope_iter *iter, size_t length) {
r->num_chars -= length;
size_t offset = iter->s[0].skip_size;
while (length) {
if (offset == e->nexts[0].skip_size) {
// End of the current node. Skip to the start of the next one.
e = iter->s[0].node->nexts[0].node;
offset = 0;
}
size_t num_chars = e->nexts[0].skip_size;
size_t removed = MIN(length, num_chars - offset);
#if ROPE_WCHAR
size_t removed_wchars;
#endif
int i;
if (removed < num_chars || e == &r->head) {
// Just trim this node down to size.
size_t leading_bytes = count_bytes_in_utf8(e->str, offset);
size_t removed_bytes = count_bytes_in_utf8(&e->str[leading_bytes], removed);
size_t trailing_bytes = e->num_bytes - leading_bytes - removed_bytes;
#if ROPE_WCHAR
removed_wchars = count_wchars_in_utf8(&e->str[leading_bytes], removed);
#endif
if (trailing_bytes) {
memmove(&e->str[leading_bytes], &e->str[leading_bytes + removed_bytes], trailing_bytes);
}
e->num_bytes -= removed_bytes;
r->num_bytes -= removed_bytes;
for (i = 0; i < e->height; i++) {
e->nexts[i].skip_size -= removed;
#if ROPE_WCHAR
e->nexts[i].wchar_size -= removed_wchars;
#endif
}
} else {
// Remove the node from the list
#if ROPE_WCHAR
removed_wchars = e->nexts[0].wchar_size;
#endif
for (i = 0; i < e->height; i++) {
iter->s[i].node->nexts[i].node = e->nexts[i].node;
iter->s[i].node->nexts[i].skip_size += e->nexts[i].skip_size - removed;
#if ROPE_WCHAR
iter->s[i].node->nexts[i].wchar_size += e->nexts[i].wchar_size - removed_wchars;
#endif
}
r->num_bytes -= e->num_bytes;
// TODO: Recycle e.
rope_node *next = e->nexts[0].node;
r->free(e);
e = next;
}
for (; i < r->head.height; i++) {
iter->s[i].node->nexts[i].skip_size -= removed;
#if ROPE_WCHAR
iter->s[i].node->nexts[i].wchar_size -= removed_wchars;
#endif
}
length -= removed;
}
}
void rope_del(rope *r, size_t pos, size_t length) {
#ifdef DEBUG
_rope_check(r);
#endif
assert(r);
pos = MIN(pos, r->num_chars);
length = MIN(length, r->num_chars - pos);
rope_iter iter;
// Search for the node where we'll insert the string.
rope_node *e = iter_at_char_pos(r, pos, &iter);
rope_del_at_iter(r, e, &iter, length);
#ifdef DEBUG
_rope_check(r);
#endif
}
#if ROPE_WCHAR
size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out) {
#ifdef DEBUG
_rope_check(r);
#endif
assert(r);
size_t wchar_total = rope_wchar_count(r);
wchar_pos = MIN(wchar_pos, wchar_total);
wchar_num = MIN(wchar_num, wchar_total - wchar_pos);
rope_iter iter;
// Search for the node where we'll insert the string.
rope_node *start = iter_at_wchar_pos(r, wchar_pos, &iter);
size_t char_pos = iter.s[r->head.height - 1].skip_size;
rope_iter end_iter;
int h = r->head.height - 1;
iter_at_wchar_pos(r, iter.s[h].wchar_size + wchar_num, &end_iter);
size_t char_length = end_iter.s[h].skip_size - iter.s[h].skip_size;
rope_del_at_iter(r, start, &iter, char_length);
#ifdef DEBUG
_rope_check(r);
#endif
if (char_len_out) {
*char_len_out = char_length;
}
return char_pos;
}
#endif
void _rope_check(rope *r) {
int i;
assert(r->head.height); // Even empty ropes have a height of 1.
assert(r->num_bytes >= r->num_chars);
rope_skip_node skip_over = r->head.nexts[r->head.height - 1];
assert(skip_over.skip_size == r->num_chars);
assert(skip_over.node == NULL);
rope_node *n;
size_t num_bytes = 0;
size_t num_chars = 0;
#if ROPE_WCHAR
size_t num_wchar = 0;
#endif
// The offsets here are used to store the total distance travelled from the start
// of the rope.
rope_iter iter = {};
for (i = 0; i < r->head.height; i++) {
iter.s[i].node = &r->head;
}
for (n = &r->head; n != NULL; n = n->nexts[0].node) {
assert(n == &r->head || n->num_bytes);
assert(n->height <= ROPE_MAX_HEIGHT);
assert(count_bytes_in_utf8(n->str, n->nexts[0].skip_size) == n->num_bytes);
#if ROPE_WCHAR
assert(count_wchars_in_utf8(n->str, n->nexts[0].skip_size) == n->nexts[0].wchar_size);
#endif
for (i = 0; i < n->height; i++) {
assert(iter.s[i].node == n);
assert(iter.s[i].skip_size == num_chars);
iter.s[i].node = n->nexts[i].node;
iter.s[i].skip_size += n->nexts[i].skip_size;
#if ROPE_WCHAR
assert(iter.s[i].wchar_size == num_wchar);
iter.s[i].wchar_size += n->nexts[i].wchar_size;
#endif
}
num_bytes += n->num_bytes;
num_chars += n->nexts[0].skip_size;
#if ROPE_WCHAR
num_wchar += n->nexts[0].wchar_size;
#endif
}
for (i = 0; i < r->head.height; i++) {
assert(iter.s[i].node == NULL);
assert(iter.s[i].skip_size == num_chars);
#if ROPE_WCHAR
assert(iter.s[i].wchar_size == num_wchar);
#endif
}
assert(r->num_bytes == num_bytes);
assert(r->num_chars == num_chars);
#if ROPE_WCHAR
assert(skip_over.wchar_size == num_wchar);
#endif
}
// For debugging.
#include <stdio.h>
void _rope_print(rope *r) {
int i;
rope_node *n;
printf("chars: %zd\tbytes: %zd\theight: %d\n", r->num_chars, r->num_bytes, r->head.height);
printf("HEAD");
for (i = 0; i < r->head.height; i++) {
printf(" |%3zd ", r->head.nexts[i].skip_size);
}
printf("\n");
int num = 0;
for (n = &r->head; n != NULL; n = n->nexts[0].node) {
printf("%3d:", num++);
for ( i = 0; i < n->height; i++) {
printf(" |%3zd ", n->nexts[i].skip_size);
}
printf(" : \"");
fwrite(n->str, n->num_bytes, 1, stdout);
printf("\"\n");
}
}
/* UTF-8 Rope implementation by Joseph Gentle
*
* This library implements a heavyweight utf8 string type with fast
* insert-at-position and delete-at-position operations.
*
* It uses skip lists instead of trees. Trees might be faster - who knows?
*
* Ropes are NOT THREAD SAFE. Do not call multiple rope methods
* simultaneously from different threads.
*/
#ifndef librope_rope_h
#define librope_rope_h
#include <stdint.h>
#include <stddef.h>
// Whether or not the rope should support converting UTF-8 character offsets to wchar array
// positions. This is useful when interoperating with strings in JS, Objective-C and many other
// languages. See http://josephg.com/post/31707645955/string-length-lies
//
// Adding wchar conversion support decreases performance by about 30%.
#ifndef ROPE_WCHAR
#define ROPE_WCHAR 0
#endif
// These two magic values seem to be approximately optimal given the benchmark in
// tests.c which does lots of small inserts.
// Must be <= UINT16_MAX. Benchmarking says this is pretty close to optimal
// (tested on a mac using clang 4.0 and x86_64).
#ifndef ROPE_NODE_STR_SIZE
#if ROPE_WCHAR
#define ROPE_NODE_STR_SIZE 64
#else
#define ROPE_NODE_STR_SIZE 136
#endif
#endif
// The likelyhood (%) a node will have height (n+1) instead of n
#ifndef ROPE_BIAS
#define ROPE_BIAS 25
#endif
// The rope will stop being efficient after the string is 2 ^ ROPE_MAX_HEIGHT nodes.
#ifndef ROPE_MAX_HEIGHT
#define ROPE_MAX_HEIGHT 60
#endif
struct rope_node_t;
// The number of characters in str can be read out of nexts[0].skip_size.
typedef struct {
// The number of _characters_ between the start of the current node
// and the start of next.
size_t skip_size;
// For some reason, librope runs about 1% faster when this next pointer is
// exactly _here_ in the struct.
struct rope_node_t *node;
#if ROPE_WCHAR
// The number of wide characters contained in space.
size_t wchar_size;
#endif
} rope_skip_node;
typedef struct rope_node_t {
uint8_t str[ROPE_NODE_STR_SIZE];
// The number of bytes in str in use
uint16_t num_bytes;
// This is the number of elements allocated in nexts.
// Each height is 1/2 as likely as the height before. The minimum height is 1.
uint8_t height;
rope_skip_node nexts[];
} rope_node;
typedef struct {
// The total number of characters in the rope.
size_t num_chars;
// The total number of bytes which the characters in the rope take up.
size_t num_bytes;
void *(*alloc)(size_t bytes);
void *(*realloc)(void *ptr, size_t newsize);
void (*free)(void *ptr);
// The first node exists inline in the rope structure itself.
rope_node head;
} rope;
#ifdef __cplusplus
extern "C" {
#endif
// Create a new rope with no contents
rope *rope_new();
// Create a new rope using custom allocators.
rope *rope_new2(void *(*alloc)(size_t bytes),
void *(*realloc)(void *ptr, size_t newsize),
void (*free)(void *ptr));
// Create a new rope containing a copy of the given string. Shorthand for
// r = rope_new(); rope_insert(r, 0, str);
rope *rope_new_with_utf8(const uint8_t *str);
// Make a copy of an existing rope
rope *rope_copy(const rope *r);
// Free the specified rope
void rope_free(rope *r);
// Get the number of characters in a rope
size_t rope_char_count(rope *r);
// Get the number of bytes which the rope would take up if stored as a utf8
// string
size_t rope_byte_count(rope *r);
// Copies the rope's contents into a utf8 encoded C string. Also copies a trailing '\0' character.
// Returns the number of bytes written, which is rope_byte_count(r) + 1.
size_t rope_write_cstr(rope *r, uint8_t *dest);
// Create a new C string which contains the rope. The string will contain
// the rope encoded as utf8, followed by a trailing '\0'.
// Use rope_byte_count(r) to get the length of the returned string.
uint8_t *rope_create_cstr(rope *r);
// Insert the given utf8 string into the rope at the specified position.
void rope_insert(rope *r, size_t pos, const uint8_t *str);
// Delete num characters at position pos. Deleting past the end of the string
// has no effect.
void rope_del(rope *r, size_t pos, size_t num);
// This macro expands to a for() loop header which loops over the segments in a rope.
//
// Eg:
// rope *r = rope_new_with_utf8(str);
// ROPE_FOREACH(r, iter) {
// printf("%s", rope_node_data(iter));
// }
#define ROPE_FOREACH(rope, iter) \
for (rope_node *iter = &(rope)->head; iter != NULL; iter = iter->nexts[0].node)
// Get the actual data inside a rope node.
static inline uint8_t *rope_node_data(rope_node *n) {
return n->str;
}
// Get the number of bytes inside a rope node. This is useful when you're looping through a rope.
static inline size_t rope_node_num_bytes(rope_node *n) {
return n->num_bytes;
}
// Get the number of characters inside a rope node.
static inline size_t rope_node_chars(rope_node *n) {
return n->nexts[0].skip_size;
}
#if ROPE_WCHAR
// Get the number of wchar characters in the rope
size_t rope_wchar_count(rope *r);
// Insert the given utf8 string into the rope at the specified wchar position. This is compatible
// with NSString, Javascript, etc. The string still needs to be passed in using UTF-8.
//
// Returns the insertion position in characters.
size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *utf8_str);
// Delete wchar_num wide characters at the specified wchar position offset.
// Returns the deletion position in characters. *char_len_out is set to the deletion length, in
// chars if its not null.
// If the range is inside character boundaries, behaviour is undefined.
size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out);
// Get the number of wchars inside a rope node. This is useful when you're looping throuhg a rope.
static inline size_t rope_node_wchars(rope_node *n) {
return n->nexts[0].wchar_size;
}
#endif
// For debugging.
void _rope_check(rope *r);
void _rope_print(rope *r);
#ifdef __cplusplus
}
#endif
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment