/RLE0DifEncoder.cpp

## RLE0DifEncoder.cpp
#include "RLE0DifEncoder.h"
#include <cassert>
#include <stdint.h>
#include <memory>
#include <stdexcept>

///////////////////////////////////////////////////////////////////////////////

namespace matise {
namespace journal {
namespace details {

///////////////////////////////////////////////////////////////////////////////

typedef uint16_t marker_t;
static const marker_t ZERO_MASK   = 1 << 15;
static const marker_t COUNT_MASK  = 0x7FFF;
static const marker_t COUNT_MAX   = 0x7FFF;
static const char* INPUT_OVERFLOW_ERR = "Input diffs buffer size smaller than its encoded contents implies. Alternatively the buffer content may be corrupted.";

///////////////////////////////////////////////////////////////////////////////

bool RLE0DifEncoder::Encode(
    const void* original,
    const void* update,
    size_t&     input_size,
    void*       output,
    size_t&     output_size) {

    assert(original != nullptr);
    assert(update != nullptr);
    assert(input_size > 0);
    assert(output != nullptr);
    assert(output_size > 0);

    marker_t count;
    uint8_t* outp = (uint8_t*)output;
    uint8_t* iorigp = (uint8_t*)original;
    uint8_t* iupdp = (uint8_t*)update;
    const uint8_t* const iEnd = ((const uint8_t* const)original) + input_size;
    const uint8_t* const oEnd = ((const uint8_t* const)output) + output_size;

    while (outp <= oEnd - sizeof(marker_t) && iorigp < iEnd) {
        count = 0;

        if (*iorigp == *iupdp) { // Run of equal bytes

            // Stride by 'sizeof(size_t)' first
            while ((iEnd - iorigp >= sizeof(size_t)) &&
                   (count + sizeof(size_t) <= COUNT_MAX) &&
                   (*(size_t*)iorigp == *(size_t*)iupdp)) {
                count += sizeof(size_t);
                iorigp += sizeof(size_t);
                iupdp += sizeof(size_t);
            }

            // Remainder as single bytes.
            while ((iorigp < iEnd) &&
                   (count < COUNT_MAX) &&
                   (*iorigp == *iupdp)) {
                count++, iorigp++, iupdp++;
            }

            // Write 'run-of-0s' marker.
            *(marker_t*)(outp) = (count | ZERO_MASK);
            outp += sizeof(marker_t);

        } else { // Run of different bytes.

            // record marker position & advance
            marker_t* countPos = (marker_t*)outp;
            outp += sizeof(marker_t);

            // Copy until at end of buffer or we encounter
            // at least 4 consecutive equal bytes.
            while ((iEnd - iorigp >= sizeof(uint32_t)) &&
                   (*(uint32_t*)iorigp != *(uint32_t*)iupdp) &&
                   (oEnd - outp >= sizeof(uint32_t)) &&
                   (count + sizeof(uint32_t) <= COUNT_MAX)) {
                *(uint32_t*)outp = *(uint32_t*)iupdp;
                outp += sizeof(uint32_t);
                iorigp += sizeof(uint32_t);
                iupdp += sizeof(uint32_t);
                count += sizeof(uint32_t);
            }

            // last few bytes if any.
            if (iEnd - iorigp < sizeof(uint32_t)) {
                while (outp < oEnd && iorigp < iEnd && count <= COUNT_MAX) {
                    *outp++ = *iupdp++;
                    iorigp++;
                    count++;
                }
            }

            // record number of changed bytes
            *countPos = count;
        }
    }

    output_size = outp - ((uint8_t*)output);
    input_size = iorigp - ((uint8_t*)original);
    return (iorigp <= oEnd) && (iorigp >= iEnd);
}

///////////////////////////////////////////////////////////////////////////////

bool RLE0DifEncoder::Decode(
    const void* original,
    void*       dest,
    size_t&     dest_size,
    const void* diffs,
    size_t      diff_size) {

    assert(original != nullptr);
    assert(dest != nullptr);
    assert(dest_size > 0);
    assert(diffs != nullptr);
    assert(diff_size > 0);

    marker_t count;
    uint8_t *outp = (uint8_t*)dest;
    uint8_t *origp = (uint8_t*)original;
    uint8_t *inp = (uint8_t*)diffs;
    uint8_t *src;
    const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diff_size;
    const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size;
    dest_size = 0;

    while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) {
        count = *(marker_t*)inp;
        inp += sizeof(marker_t);

        if (count & ZERO_MASK) {

            // same as original
            count &= COUNT_MASK;
            src = origp; // copy from original

        } else { // different from original

            // check for input overflow (error).
            assert(count <= iEnd - inp);
            if (count > iEnd - inp)
                throw std::length_error(INPUT_OVERFLOW_ERR);
            src = inp; // copy from encoded input
            inp += count;
        }

        // check for output overflow
        if (count > oEnd - outp)
            return false;

        // copy block from either original or diffs to destination.
        memcpy(outp, src, count);
        outp += count;
        origp += count;
        dest_size += count;
    }
    return true;
}

///////////////////////////////////////////////////////////////////////////////

bool RLE0DifEncoder::Decode(
    void*       dest,
    size_t&     dest_size,
    const void* diffs,
    size_t      diffs_size) {

    assert(dest != nullptr);
    assert(dest_size > 0);
    assert(diffs != nullptr);
    assert(diffs_size > 0);

    marker_t count;
    uint8_t *outp = (uint8_t*)dest;
    uint8_t *inp = (uint8_t*)diffs;
    const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diffs_size;
    const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size;
    dest_size = 0;

    while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) {
        count = *(marker_t*)inp;
        inp += sizeof(marker_t);

        if (count & ZERO_MASK) {

            // same as original, we can simply skip.
            count &= COUNT_MASK;

        } else { // different, copy changes to destination

            // check for input overflow (error).
            assert(count <= iEnd - inp);
            if (count > iEnd - inp)
                throw std::length_error(INPUT_OVERFLOW_ERR);

            // check for output overflow
            if (count > oEnd - outp)
                return false;

            // copy the block of differences.
            memcpy(outp, inp, count);
            inp += count;
        }
        outp += count;
        dest_size += count;
    }
    return outp <= oEnd;
}

///////////////////////////////////////////////////////////////////////////////

}}}

## RLE0DifEncoder.cs
namespace TryOuts
{
    using System;
    using System.Reflection.Emit;

    public unsafe class RLE0DifEncoder
    {
        private delegate void MemCopyFunc(void* dest, void* src, int n);
        private static readonly MemCopyFunc _memcpy;
        const UInt32 ZERO_MASK = 1u << 31;
        const UInt32 COUNT_MASK = 0x7FFFFFFF;

        static RLE0DifEncoder()
        {
            _memcpy = BuildMemCpy();
        }

        static MemCopyFunc BuildMemCpy()
        {
            var args = new[] { typeof(void*), typeof(void*), typeof(int) };

            // input may be unaligned
            var method = new DynamicMethod("DynamicIL_cpblk_unaligned", typeof(void), args, typeof(RLE0DifEncoder));
            var ilGen = method.GetILGenerator();
            ilGen.Emit(OpCodes.Ldarg_0);
            ilGen.Emit(OpCodes.Ldarg_1);
            ilGen.Emit(OpCodes.Ldarg_2);
            if (sizeof(IntPtr) == 8)
                ilGen.Emit(OpCodes.Unaligned, 1);
            ilGen.Emit(OpCodes.Cpblk);
            ilGen.Emit(OpCodes.Ret);
            return (MemCopyFunc)method.CreateDelegate(typeof(MemCopyFunc));
        }

        public static long Encode(byte* original, byte* update, int input_size, byte* output, int output_size)
        {
            // Encoding is per 32 bits, input and output must be a multiple of that.
            System.Diagnostics.Debug.Assert((input_size % sizeof(UInt32)) == 0);
            System.Diagnostics.Debug.Assert((output_size % sizeof(UInt32)) == 0);

            UInt32 count = 0;
            UInt32* origp = (UInt32*)original, newp = (UInt32*)update;
            UInt32* outp = (UInt32*)output;
            UInt32* iEnd = (UInt32*)(original + input_size);
            UInt32* oEnd = (UInt32*)(output + output_size);

            count = 0;
            while (origp < iEnd && outp < oEnd)
            {
                count = 0;
                if (*origp == *newp) // run of 0s
                {
                    while (origp < iEnd && *origp == *newp)
                    {
                        count++; origp++; newp++;
                    }
                    *outp++ = (count | ZERO_MASK); // record run of 0s in output.
                }
                else // changes
                {
                    UInt32* marker_pos = outp++;
                    while (origp < iEnd && outp < oEnd && *origp != *newp)
                    {
                        *outp++ = *newp++;
                        count++; origp++;
                    }
                    *marker_pos = count; // record number of changes
                }
            }
            return origp < iEnd
                ? original - (byte*)iEnd  // output buffer too small, return '-remaining bytes in input'
                : (((byte*)outp) - output);
        }

        public static bool Decode(byte* destination, int dest_size, byte* differences, long dif_size)
        {
            // Encoding is per 32 bits, input and output must be a multiple of that.
            System.Diagnostics.Debug.Assert((dest_size % sizeof(UInt32)) == 0);
            System.Diagnostics.Debug.Assert((dif_size % sizeof(UInt32)) == 0);

            UInt32 count = 0;
            UInt32* destp = (UInt32*)destination, srcp = (UInt32*)differences;
            UInt32* oEnd = (UInt32*)(destination + dest_size);
            UInt32* iEnd = (UInt32*)(differences + dif_size);

            while (destp < oEnd && srcp < iEnd)
            {
                count = *srcp++;
                if ((count & ZERO_MASK) == ZERO_MASK)
                {
                    count &= COUNT_MASK; // we can skip these.
                }
                else // block of changes to copy
                {
                    if (destp + count > oEnd && srcp + count > iEnd)
                        return false; // buffer sizes not correct.
                    _memcpy(destp, srcp, (int)(count * sizeof(UInt32)));
                    srcp += count;
                }
                destp += count;
            }
            return destp <= oEnd && srcp == iEnd;
        }
    }
}

## RLE0DifEncoder.h
#pragma once

namespace matise {
namespace journal {
namespace details {

/** Encoder for differences between an original and updated memory block.
 *
 *  The idea is that changes to B+Tree pages tend to be relatively small in
 *  most cases and rather localized. Thus, for most of the bytes in a page,
 *  the difference between original and updated page would be 0. We can simply
 *  compare bytes of the original and updated page and store the count of
 *  bytes with no difference, or the count of bytes that are different followed
 *  by those changed bytes. Decoding then becomes really simple as well. When
 *  we come across a "run of 0-s" (i.e. no changes) marker, we can simply skip
 *  the count of 0s forward in the destination, without writing to it, or
 *  alternatively, copy those from a given original to the destination. When
 *  we see a "different count" marker, we can simply copy that amount of bytes
 *  from the encoded contents to the destination.
 *
 *  Thus output is of the following format:
 *
 *  ... [x bytes unchanged][y bytes changed][byte 1 .. y][z bytes unchanged] ...
 *
 *  This results in only storing the changes, with little overhead for a
 *  (16 bit) marker word, unless the changes are very fragmented, which due to
 *  the design of B+Tree nodes and other stored structures should be rare.
 *
 *  Note: for a first version of a page, compare to a 0 filled original.
 */
class RLE0DifEncoder {
public:
    /** Encodes the difference between the given @original and @update memory
     *  blocks of size @input_size and stores the encoded differences in the
     *  given @output. Will not exceed @output_size in the output, and update
     *  @output_size to the amount of bytes written to @output.
     *  Returns 'false' if the encoded bytes don't fit in the given @output.
     */
    static bool Encode(
        const void* original,     // in: original mem block before update
        const void* update,       // in: mem block after updata
        size_t&     input_size,   // in: size of input mem blocks; out: # bytes consumed
        void*       output,       // out: encoded diffs between @original and @update
        size_t&     output_size); // in: size of @output; out: # bytes written to @output

    /** Decodes the encoded differences in @diffs versus the given read-only
     *  @original and writes them to the given @dest page, copying bytes from
     *  @original to @dest when a 'run-of-0s' is encountered.
     *  Returns 'false' if the given @dest_size is not large enough.
     *  Throws if @diffs_size is too small according to the encoded contents
     *  of @diffs.
     */
    static bool Decode(
        const void* original,     // in: original mem block before update
        void*       dest,         // out: reconstructed update
        size_t&     dest_size,    // in: size of input mem block; out: # bytes written
        const void* diffs,        // in: RLE0 dif encoded data to be restored
        size_t      diffs_size);  // in: size of @diffs

    /** Decodes the encoded differences in @input versus the writable
     *  copy of the original given as @dest, skipping bytes in @dest when
     *  a 'run-of-0s' is encountered.
     *  Returns 'false' if the given @dest_size is not large enough.
     *  Throws if @diffs_size is too small according to the encoded contents
     *  of @diffs.
     */
    static bool Decode(
        void*       dest,         // in: mem block with original data to be updated in-place
        size_t&     dest_size,    // in: size of input mem block; out: # bytes written
        const void* diffs,        // in: RLE0 dif encoded data to be restored
        size_t      diffs_size);  // in: size of @diffs
};

}}}

## sample.cpp
#include "RLE0DifEncoder.h"
#include <random>
#include <algorithm>
#include <iostream>

static void TryRLE0() {
    using random_bytes_engine = std::independent_bits_engine<std::default_random_engine, std::numeric_limits<uint16_t>::digits, uint16_t>;
    random_bytes_engine rbe;
    auto rbe_uint8_t = [&rbe] {
        return (uint8_t)(rbe() & 0xFF);
    };

    // memory for the different pages.
    auto mem = (uint8_t*)malloc(4096 * 4);
    auto orig = mem;
    auto update = orig + 4096;
    auto restored = update + 4096;
    auto diffs = restored + 4096;

    // generate random content for original
    std::generate(orig, orig + 4096, std::ref(rbe_uint8_t));

    for (auto i = 0; i < 1000; ++i) {

        // create updates in update page.
        memcpy(update, orig, 4096);
        size_t encoded_len = 0;
        auto n = 1 + (rbe() & 7);
        for (auto j = 0; j < n; j++) {
            auto offs = rbe() % 4096;
            auto len = 1 + (rbe() % 256);
            if (offs + len > 4096)
                len = 4096 - offs;
            std::generate(update + offs, update + offs + len, std::ref(rbe_uint8_t));
            encoded_len += len;
        }

        // encode differences.
        size_t in_sz = 4096;
        size_t out_sz = 4096;
        if (!matise::journal::details::RLE0DifEncoder::Encode(orig, update, in_sz, diffs, out_sz)) {
            std::cout << "Failed to encode" << std::endl;
            break;
        }
        std::cout << i << ": encoded " << encoded_len << " bytes changes in " << out_sz << " bytes." << std::endl;

        // decode to restored.
        in_sz = 4096;
        if (rbe() & 1) {
            if (!matise::journal::details::RLE0DifEncoder::Decode(orig, restored, in_sz, diffs, out_sz)) {
                std::cout << "Failed to decode (with orig option)" << std::endl;
                break;
            }
        } else {
            memcpy(restored, orig, 4096);
            if (!matise::journal::details::RLE0DifEncoder::Decode(restored, in_sz, diffs, out_sz)) {
                std::cout << "Failed to decode (with in place option)" << std::endl;
                break;
            }
        }

        // ensure restored matches update.
        if (memcmp(restored, update, 4096) != 0) {
            std::cout << "Failed: restored does not match update" << std::endl;
            break;
        }
    }
	#include "RLE0DifEncoder.h"
	#include <cassert>
	#include <stdint.h>
	#include <memory>
	#include <stdexcept>

	///////////////////////////////////////////////////////////////////////////////

	namespace matise {
	namespace journal {
	namespace details {

	///////////////////////////////////////////////////////////////////////////////

	typedef uint16_t marker_t;
	static const marker_t ZERO_MASK = 1 << 15;
	static const marker_t COUNT_MASK = 0x7FFF;
	static const marker_t COUNT_MAX = 0x7FFF;
	static const char* INPUT_OVERFLOW_ERR = "Input diffs buffer size smaller than its encoded contents implies. Alternatively the buffer content may be corrupted.";

	///////////////////////////////////////////////////////////////////////////////

	bool RLE0DifEncoder::Encode(
	const void* original,
	const void* update,
	size_t& input_size,
	void* output,
	size_t& output_size) {

	assert(original != nullptr);
	assert(update != nullptr);
	assert(input_size > 0);
	assert(output != nullptr);
	assert(output_size > 0);

	marker_t count;
	uint8_t* outp = (uint8_t*)output;
	uint8_t* iorigp = (uint8_t*)original;
	uint8_t* iupdp = (uint8_t*)update;
	const uint8_t* const iEnd = ((const uint8_t* const)original) + input_size;
	const uint8_t* const oEnd = ((const uint8_t* const)output) + output_size;

	while (outp <= oEnd - sizeof(marker_t) && iorigp < iEnd) {
	count = 0;

	if (iorigp == iupdp) { // Run of equal bytes

	// Stride by 'sizeof(size_t)' first
	while ((iEnd - iorigp >= sizeof(size_t)) &&
	(count + sizeof(size_t) <= COUNT_MAX) &&
	((size_t)iorigp == (size_t)iupdp)) {
	count += sizeof(size_t);
	iorigp += sizeof(size_t);
	iupdp += sizeof(size_t);
	}

	// Remainder as single bytes.
	while ((iorigp < iEnd) &&
	(count < COUNT_MAX) &&
	(iorigp == iupdp)) {
	count++, iorigp++, iupdp++;
	}

	// Write 'run-of-0s' marker.
	(marker_t)(outp) = (count \| ZERO_MASK);
	outp += sizeof(marker_t);

	} else { // Run of different bytes.

	// record marker position & advance
	marker_t* countPos = (marker_t*)outp;
	outp += sizeof(marker_t);

	// Copy until at end of buffer or we encounter
	// at least 4 consecutive equal bytes.
	while ((iEnd - iorigp >= sizeof(uint32_t)) &&
	((uint32_t)iorigp != (uint32_t)iupdp) &&
	(oEnd - outp >= sizeof(uint32_t)) &&
	(count + sizeof(uint32_t) <= COUNT_MAX)) {
	(uint32_t)outp = (uint32_t)iupdp;
	outp += sizeof(uint32_t);
	iorigp += sizeof(uint32_t);
	iupdp += sizeof(uint32_t);
	count += sizeof(uint32_t);
	}

	// last few bytes if any.
	if (iEnd - iorigp < sizeof(uint32_t)) {
	while (outp < oEnd && iorigp < iEnd && count <= COUNT_MAX) {
	outp++ = iupdp++;
	iorigp++;
	count++;
	}
	}

	// record number of changed bytes
	*countPos = count;
	}
	}

	output_size = outp - ((uint8_t*)output);
	input_size = iorigp - ((uint8_t*)original);
	return (iorigp <= oEnd) && (iorigp >= iEnd);
	}

	///////////////////////////////////////////////////////////////////////////////

	bool RLE0DifEncoder::Decode(
	const void* original,
	void* dest,
	size_t& dest_size,
	const void* diffs,
	size_t diff_size) {

	assert(original != nullptr);
	assert(dest != nullptr);
	assert(dest_size > 0);
	assert(diffs != nullptr);
	assert(diff_size > 0);

	marker_t count;
	uint8_t outp = (uint8_t)dest;
	uint8_t origp = (uint8_t)original;
	uint8_t inp = (uint8_t)diffs;
	uint8_t *src;
	const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diff_size;
	const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size;
	dest_size = 0;

	while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) {
	count = (marker_t)inp;
	inp += sizeof(marker_t);

	if (count & ZERO_MASK) {

	// same as original
	count &= COUNT_MASK;
	src = origp; // copy from original

	} else { // different from original

	// check for input overflow (error).
	assert(count <= iEnd - inp);
	if (count > iEnd - inp)
	throw std::length_error(INPUT_OVERFLOW_ERR);
	src = inp; // copy from encoded input
	inp += count;
	}

	// check for output overflow
	if (count > oEnd - outp)
	return false;

	// copy block from either original or diffs to destination.
	memcpy(outp, src, count);
	outp += count;
	origp += count;
	dest_size += count;
	}
	return true;
	}

	///////////////////////////////////////////////////////////////////////////////

	bool RLE0DifEncoder::Decode(
	void* dest,
	size_t& dest_size,
	const void* diffs,
	size_t diffs_size) {

	assert(dest != nullptr);
	assert(dest_size > 0);
	assert(diffs != nullptr);
	assert(diffs_size > 0);

	marker_t count;
	uint8_t outp = (uint8_t)dest;
	uint8_t inp = (uint8_t)diffs;
	const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diffs_size;
	const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size;
	dest_size = 0;

	while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) {
	count = (marker_t)inp;
	inp += sizeof(marker_t);

	if (count & ZERO_MASK) {

	// same as original, we can simply skip.
	count &= COUNT_MASK;

	} else { // different, copy changes to destination

	// check for input overflow (error).
	assert(count <= iEnd - inp);
	if (count > iEnd - inp)
	throw std::length_error(INPUT_OVERFLOW_ERR);

	// check for output overflow
	if (count > oEnd - outp)
	return false;

	// copy the block of differences.
	memcpy(outp, inp, count);
	inp += count;
	}
	outp += count;
	dest_size += count;
	}
	return outp <= oEnd;
	}

	///////////////////////////////////////////////////////////////////////////////

	}}}
	namespace TryOuts
	{
	using System;
	using System.Reflection.Emit;

	public unsafe class RLE0DifEncoder
	{
	private delegate void MemCopyFunc(void* dest, void* src, int n);
	private static readonly MemCopyFunc _memcpy;
	const UInt32 ZERO_MASK = 1u << 31;
	const UInt32 COUNT_MASK = 0x7FFFFFFF;

	static RLE0DifEncoder()
	{
	_memcpy = BuildMemCpy();
	}

	static MemCopyFunc BuildMemCpy()
	{
	var args = new[] { typeof(void), typeof(void), typeof(int) };

	// input may be unaligned
	var method = new DynamicMethod("DynamicIL_cpblk_unaligned", typeof(void), args, typeof(RLE0DifEncoder));
	var ilGen = method.GetILGenerator();
	ilGen.Emit(OpCodes.Ldarg_0);
	ilGen.Emit(OpCodes.Ldarg_1);
	ilGen.Emit(OpCodes.Ldarg_2);
	if (sizeof(IntPtr) == 8)
	ilGen.Emit(OpCodes.Unaligned, 1);
	ilGen.Emit(OpCodes.Cpblk);
	ilGen.Emit(OpCodes.Ret);
	return (MemCopyFunc)method.CreateDelegate(typeof(MemCopyFunc));
	}

	public static long Encode(byte* original, byte* update, int input_size, byte* output, int output_size)
	{
	// Encoding is per 32 bits, input and output must be a multiple of that.
	System.Diagnostics.Debug.Assert((input_size % sizeof(UInt32)) == 0);
	System.Diagnostics.Debug.Assert((output_size % sizeof(UInt32)) == 0);

	UInt32 count = 0;
	UInt32* origp = (UInt32)original, newp = (UInt32)update;
	UInt32* outp = (UInt32*)output;
	UInt32* iEnd = (UInt32*)(original + input_size);
	UInt32* oEnd = (UInt32*)(output + output_size);

	count = 0;
	while (origp < iEnd && outp < oEnd)
	{
	count = 0;
	if (origp == newp) // run of 0s
	{
	while (origp < iEnd && origp == newp)
	{
	count++; origp++; newp++;
	}
	*outp++ = (count \| ZERO_MASK); // record run of 0s in output.
	}
	else // changes
	{
	UInt32* marker_pos = outp++;
	while (origp < iEnd && outp < oEnd && origp != newp)
	{
	outp++ = newp++;
	count++; origp++;
	}
	*marker_pos = count; // record number of changes
	}
	}
	return origp < iEnd
	? original - (byte*)iEnd // output buffer too small, return '-remaining bytes in input'
	: (((byte*)outp) - output);
	}

	public static bool Decode(byte* destination, int dest_size, byte* differences, long dif_size)
	{
	// Encoding is per 32 bits, input and output must be a multiple of that.
	System.Diagnostics.Debug.Assert((dest_size % sizeof(UInt32)) == 0);
	System.Diagnostics.Debug.Assert((dif_size % sizeof(UInt32)) == 0);

	UInt32 count = 0;
	UInt32* destp = (UInt32)destination, srcp = (UInt32)differences;
	UInt32* oEnd = (UInt32*)(destination + dest_size);
	UInt32* iEnd = (UInt32*)(differences + dif_size);

	while (destp < oEnd && srcp < iEnd)
	{
	count = *srcp++;
	if ((count & ZERO_MASK) == ZERO_MASK)
	{
	count &= COUNT_MASK; // we can skip these.
	}
	else // block of changes to copy
	{
	if (destp + count > oEnd && srcp + count > iEnd)
	return false; // buffer sizes not correct.
	_memcpy(destp, srcp, (int)(count * sizeof(UInt32)));
	srcp += count;
	}
	destp += count;
	}
	return destp <= oEnd && srcp == iEnd;
	}
	}
	}
	#pragma once

	namespace matise {
	namespace journal {
	namespace details {

	/** Encoder for differences between an original and updated memory block.
	*
	* The idea is that changes to B+Tree pages tend to be relatively small in
	* most cases and rather localized. Thus, for most of the bytes in a page,
	* the difference between original and updated page would be 0. We can simply
	* compare bytes of the original and updated page and store the count of
	* bytes with no difference, or the count of bytes that are different followed
	* by those changed bytes. Decoding then becomes really simple as well. When
	* we come across a "run of 0-s" (i.e. no changes) marker, we can simply skip
	* the count of 0s forward in the destination, without writing to it, or
	* alternatively, copy those from a given original to the destination. When
	* we see a "different count" marker, we can simply copy that amount of bytes
	* from the encoded contents to the destination.
	*
	* Thus output is of the following format:
	*
	* ... [x bytes unchanged][y bytes changed][byte 1 .. y][z bytes unchanged] ...
	*
	* This results in only storing the changes, with little overhead for a
	* (16 bit) marker word, unless the changes are very fragmented, which due to
	* the design of B+Tree nodes and other stored structures should be rare.
	*
	* Note: for a first version of a page, compare to a 0 filled original.
	*/
	class RLE0DifEncoder {
	public:
	/** Encodes the difference between the given @original and @update memory
	* blocks of size @input_size and stores the encoded differences in the
	* given @output. Will not exceed @output_size in the output, and update
	* @output_size to the amount of bytes written to @output.
	* Returns 'false' if the encoded bytes don't fit in the given @output.
	*/
	static bool Encode(
	const void* original, // in: original mem block before update
	const void* update, // in: mem block after updata
	size_t& input_size, // in: size of input mem blocks; out: # bytes consumed
	void* output, // out: encoded diffs between @original and @update
	size_t& output_size); // in: size of @output; out: # bytes written to @output

	/** Decodes the encoded differences in @diffs versus the given read-only
	* @original and writes them to the given @dest page, copying bytes from
	* @original to @dest when a 'run-of-0s' is encountered.
	* Returns 'false' if the given @dest_size is not large enough.
	* Throws if @diffs_size is too small according to the encoded contents
	* of @diffs.
	*/
	static bool Decode(
	const void* original, // in: original mem block before update
	void* dest, // out: reconstructed update
	size_t& dest_size, // in: size of input mem block; out: # bytes written
	const void* diffs, // in: RLE0 dif encoded data to be restored
	size_t diffs_size); // in: size of @diffs

	/** Decodes the encoded differences in @input versus the writable
	* copy of the original given as @dest, skipping bytes in @dest when
	* a 'run-of-0s' is encountered.
	* Returns 'false' if the given @dest_size is not large enough.
	* Throws if @diffs_size is too small according to the encoded contents
	* of @diffs.
	*/
	static bool Decode(
	void* dest, // in: mem block with original data to be updated in-place
	size_t& dest_size, // in: size of input mem block; out: # bytes written
	const void* diffs, // in: RLE0 dif encoded data to be restored
	size_t diffs_size); // in: size of @diffs
	};

	}}}
	#include "RLE0DifEncoder.h"
	#include <random>
	#include <algorithm>
	#include <iostream>

	static void TryRLE0() {
	using random_bytes_engine = std::independent_bits_engine<std::default_random_engine, std::numeric_limits<uint16_t>::digits, uint16_t>;
	random_bytes_engine rbe;
	auto rbe_uint8_t = [&rbe] {
	return (uint8_t)(rbe() & 0xFF);
	};

	// memory for the different pages.
	auto mem = (uint8_t)malloc(4096 4);
	auto orig = mem;
	auto update = orig + 4096;
	auto restored = update + 4096;
	auto diffs = restored + 4096;

	// generate random content for original
	std::generate(orig, orig + 4096, std::ref(rbe_uint8_t));

	for (auto i = 0; i < 1000; ++i) {

	// create updates in update page.
	memcpy(update, orig, 4096);
	size_t encoded_len = 0;
	auto n = 1 + (rbe() & 7);
	for (auto j = 0; j < n; j++) {
	auto offs = rbe() % 4096;
	auto len = 1 + (rbe() % 256);
	if (offs + len > 4096)
	len = 4096 - offs;
	std::generate(update + offs, update + offs + len, std::ref(rbe_uint8_t));
	encoded_len += len;
	}

	// encode differences.
	size_t in_sz = 4096;
	size_t out_sz = 4096;
	if (!matise::journal::details::RLE0DifEncoder::Encode(orig, update, in_sz, diffs, out_sz)) {
	std::cout << "Failed to encode" << std::endl;
	break;
	}
	std::cout << i << ": encoded " << encoded_len << " bytes changes in " << out_sz << " bytes." << std::endl;

	// decode to restored.
	in_sz = 4096;
	if (rbe() & 1) {
	if (!matise::journal::details::RLE0DifEncoder::Decode(orig, restored, in_sz, diffs, out_sz)) {
	std::cout << "Failed to decode (with orig option)" << std::endl;
	break;
	}
	} else {
	memcpy(restored, orig, 4096);
	if (!matise::journal::details::RLE0DifEncoder::Decode(restored, in_sz, diffs, out_sz)) {
	std::cout << "Failed to decode (with in place option)" << std::endl;
	break;
	}
	}

	// ensure restored matches update.
	if (memcmp(restored, update, 4096) != 0) {
	std::cout << "Failed: restored does not match update" << std::endl;
	break;
	}
	}