Created
August 5, 2016 22:11
-
-
Save anonymous/7acba42f4039f0c5b1405d8807f6ba8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "RLE0DifEncoder.h" | |
#include <cassert> | |
#include <stdint.h> | |
#include <memory> | |
#include <stdexcept> | |
/////////////////////////////////////////////////////////////////////////////// | |
namespace matise { | |
namespace journal { | |
namespace details { | |
/////////////////////////////////////////////////////////////////////////////// | |
typedef uint16_t marker_t; | |
static const marker_t ZERO_MASK = 1 << 15; | |
static const marker_t COUNT_MASK = 0x7FFF; | |
static const marker_t COUNT_MAX = 0x7FFF; | |
static const char* INPUT_OVERFLOW_ERR = "Input diffs buffer size smaller than its encoded contents implies. Alternatively the buffer content may be corrupted."; | |
/////////////////////////////////////////////////////////////////////////////// | |
bool RLE0DifEncoder::Encode( | |
const void* original, | |
const void* update, | |
size_t& input_size, | |
void* output, | |
size_t& output_size) { | |
assert(original != nullptr); | |
assert(update != nullptr); | |
assert(input_size > 0); | |
assert(output != nullptr); | |
assert(output_size > 0); | |
marker_t count; | |
uint8_t* outp = (uint8_t*)output; | |
uint8_t* iorigp = (uint8_t*)original; | |
uint8_t* iupdp = (uint8_t*)update; | |
const uint8_t* const iEnd = ((const uint8_t* const)original) + input_size; | |
const uint8_t* const oEnd = ((const uint8_t* const)output) + output_size; | |
while (outp <= oEnd - sizeof(marker_t) && iorigp < iEnd) { | |
count = 0; | |
if (*iorigp == *iupdp) { // Run of equal bytes | |
// Stride by 'sizeof(size_t)' first | |
while ((iEnd - iorigp >= sizeof(size_t)) && | |
(count + sizeof(size_t) <= COUNT_MAX) && | |
(*(size_t*)iorigp == *(size_t*)iupdp)) { | |
count += sizeof(size_t); | |
iorigp += sizeof(size_t); | |
iupdp += sizeof(size_t); | |
} | |
// Remainder as single bytes. | |
while ((iorigp < iEnd) && | |
(count < COUNT_MAX) && | |
(*iorigp == *iupdp)) { | |
count++, iorigp++, iupdp++; | |
} | |
// Write 'run-of-0s' marker. | |
*(marker_t*)(outp) = (count | ZERO_MASK); | |
outp += sizeof(marker_t); | |
} else { // Run of different bytes. | |
// record marker position & advance | |
marker_t* countPos = (marker_t*)outp; | |
outp += sizeof(marker_t); | |
// Copy until at end of buffer or we encounter | |
// at least 4 consecutive equal bytes. | |
while ((iEnd - iorigp >= sizeof(uint32_t)) && | |
(*(uint32_t*)iorigp != *(uint32_t*)iupdp) && | |
(oEnd - outp >= sizeof(uint32_t)) && | |
(count + sizeof(uint32_t) <= COUNT_MAX)) { | |
*(uint32_t*)outp = *(uint32_t*)iupdp; | |
outp += sizeof(uint32_t); | |
iorigp += sizeof(uint32_t); | |
iupdp += sizeof(uint32_t); | |
count += sizeof(uint32_t); | |
} | |
// last few bytes if any. | |
if (iEnd - iorigp < sizeof(uint32_t)) { | |
while (outp < oEnd && iorigp < iEnd && count <= COUNT_MAX) { | |
*outp++ = *iupdp++; | |
iorigp++; | |
count++; | |
} | |
} | |
// record number of changed bytes | |
*countPos = count; | |
} | |
} | |
output_size = outp - ((uint8_t*)output); | |
input_size = iorigp - ((uint8_t*)original); | |
return (iorigp <= oEnd) && (iorigp >= iEnd); | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
bool RLE0DifEncoder::Decode( | |
const void* original, | |
void* dest, | |
size_t& dest_size, | |
const void* diffs, | |
size_t diff_size) { | |
assert(original != nullptr); | |
assert(dest != nullptr); | |
assert(dest_size > 0); | |
assert(diffs != nullptr); | |
assert(diff_size > 0); | |
marker_t count; | |
uint8_t *outp = (uint8_t*)dest; | |
uint8_t *origp = (uint8_t*)original; | |
uint8_t *inp = (uint8_t*)diffs; | |
uint8_t *src; | |
const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diff_size; | |
const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size; | |
dest_size = 0; | |
while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) { | |
count = *(marker_t*)inp; | |
inp += sizeof(marker_t); | |
if (count & ZERO_MASK) { | |
// same as original | |
count &= COUNT_MASK; | |
src = origp; // copy from original | |
} else { // different from original | |
// check for input overflow (error). | |
assert(count <= iEnd - inp); | |
if (count > iEnd - inp) | |
throw std::length_error(INPUT_OVERFLOW_ERR); | |
src = inp; // copy from encoded input | |
inp += count; | |
} | |
// check for output overflow | |
if (count > oEnd - outp) | |
return false; | |
// copy block from either original or diffs to destination. | |
memcpy(outp, src, count); | |
outp += count; | |
origp += count; | |
dest_size += count; | |
} | |
return true; | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
bool RLE0DifEncoder::Decode( | |
void* dest, | |
size_t& dest_size, | |
const void* diffs, | |
size_t diffs_size) { | |
assert(dest != nullptr); | |
assert(dest_size > 0); | |
assert(diffs != nullptr); | |
assert(diffs_size > 0); | |
marker_t count; | |
uint8_t *outp = (uint8_t*)dest; | |
uint8_t *inp = (uint8_t*)diffs; | |
const uint8_t * const iEnd = ((const uint8_t * const)diffs) + diffs_size; | |
const uint8_t * const oEnd = ((const uint8_t * const)dest) + dest_size; | |
dest_size = 0; | |
while (outp < oEnd && inp + sizeof(marker_t) <= iEnd) { | |
count = *(marker_t*)inp; | |
inp += sizeof(marker_t); | |
if (count & ZERO_MASK) { | |
// same as original, we can simply skip. | |
count &= COUNT_MASK; | |
} else { // different, copy changes to destination | |
// check for input overflow (error). | |
assert(count <= iEnd - inp); | |
if (count > iEnd - inp) | |
throw std::length_error(INPUT_OVERFLOW_ERR); | |
// check for output overflow | |
if (count > oEnd - outp) | |
return false; | |
// copy the block of differences. | |
memcpy(outp, inp, count); | |
inp += count; | |
} | |
outp += count; | |
dest_size += count; | |
} | |
return outp <= oEnd; | |
} | |
/////////////////////////////////////////////////////////////////////////////// | |
}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace TryOuts | |
{ | |
using System; | |
using System.Reflection.Emit; | |
public unsafe class RLE0DifEncoder | |
{ | |
private delegate void MemCopyFunc(void* dest, void* src, int n); | |
private static readonly MemCopyFunc _memcpy; | |
const UInt32 ZERO_MASK = 1u << 31; | |
const UInt32 COUNT_MASK = 0x7FFFFFFF; | |
static RLE0DifEncoder() | |
{ | |
_memcpy = BuildMemCpy(); | |
} | |
static MemCopyFunc BuildMemCpy() | |
{ | |
var args = new[] { typeof(void*), typeof(void*), typeof(int) }; | |
// input may be unaligned | |
var method = new DynamicMethod("DynamicIL_cpblk_unaligned", typeof(void), args, typeof(RLE0DifEncoder)); | |
var ilGen = method.GetILGenerator(); | |
ilGen.Emit(OpCodes.Ldarg_0); | |
ilGen.Emit(OpCodes.Ldarg_1); | |
ilGen.Emit(OpCodes.Ldarg_2); | |
if (sizeof(IntPtr) == 8) | |
ilGen.Emit(OpCodes.Unaligned, 1); | |
ilGen.Emit(OpCodes.Cpblk); | |
ilGen.Emit(OpCodes.Ret); | |
return (MemCopyFunc)method.CreateDelegate(typeof(MemCopyFunc)); | |
} | |
public static long Encode(byte* original, byte* update, int input_size, byte* output, int output_size) | |
{ | |
// Encoding is per 32 bits, input and output must be a multiple of that. | |
System.Diagnostics.Debug.Assert((input_size % sizeof(UInt32)) == 0); | |
System.Diagnostics.Debug.Assert((output_size % sizeof(UInt32)) == 0); | |
UInt32 count = 0; | |
UInt32* origp = (UInt32*)original, newp = (UInt32*)update; | |
UInt32* outp = (UInt32*)output; | |
UInt32* iEnd = (UInt32*)(original + input_size); | |
UInt32* oEnd = (UInt32*)(output + output_size); | |
count = 0; | |
while (origp < iEnd && outp < oEnd) | |
{ | |
count = 0; | |
if (*origp == *newp) // run of 0s | |
{ | |
while (origp < iEnd && *origp == *newp) | |
{ | |
count++; origp++; newp++; | |
} | |
*outp++ = (count | ZERO_MASK); // record run of 0s in output. | |
} | |
else // changes | |
{ | |
UInt32* marker_pos = outp++; | |
while (origp < iEnd && outp < oEnd && *origp != *newp) | |
{ | |
*outp++ = *newp++; | |
count++; origp++; | |
} | |
*marker_pos = count; // record number of changes | |
} | |
} | |
return origp < iEnd | |
? original - (byte*)iEnd // output buffer too small, return '-remaining bytes in input' | |
: (((byte*)outp) - output); | |
} | |
public static bool Decode(byte* destination, int dest_size, byte* differences, long dif_size) | |
{ | |
// Encoding is per 32 bits, input and output must be a multiple of that. | |
System.Diagnostics.Debug.Assert((dest_size % sizeof(UInt32)) == 0); | |
System.Diagnostics.Debug.Assert((dif_size % sizeof(UInt32)) == 0); | |
UInt32 count = 0; | |
UInt32* destp = (UInt32*)destination, srcp = (UInt32*)differences; | |
UInt32* oEnd = (UInt32*)(destination + dest_size); | |
UInt32* iEnd = (UInt32*)(differences + dif_size); | |
while (destp < oEnd && srcp < iEnd) | |
{ | |
count = *srcp++; | |
if ((count & ZERO_MASK) == ZERO_MASK) | |
{ | |
count &= COUNT_MASK; // we can skip these. | |
} | |
else // block of changes to copy | |
{ | |
if (destp + count > oEnd && srcp + count > iEnd) | |
return false; // buffer sizes not correct. | |
_memcpy(destp, srcp, (int)(count * sizeof(UInt32))); | |
srcp += count; | |
} | |
destp += count; | |
} | |
return destp <= oEnd && srcp == iEnd; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
namespace matise { | |
namespace journal { | |
namespace details { | |
/** Encoder for differences between an original and updated memory block. | |
* | |
* The idea is that changes to B+Tree pages tend to be relatively small in | |
* most cases and rather localized. Thus, for most of the bytes in a page, | |
* the difference between original and updated page would be 0. We can simply | |
* compare bytes of the original and updated page and store the count of | |
* bytes with no difference, or the count of bytes that are different followed | |
* by those changed bytes. Decoding then becomes really simple as well. When | |
* we come across a "run of 0-s" (i.e. no changes) marker, we can simply skip | |
* the count of 0s forward in the destination, without writing to it, or | |
* alternatively, copy those from a given original to the destination. When | |
* we see a "different count" marker, we can simply copy that amount of bytes | |
* from the encoded contents to the destination. | |
* | |
* Thus output is of the following format: | |
* | |
* ... [x bytes unchanged][y bytes changed][byte 1 .. y][z bytes unchanged] ... | |
* | |
* This results in only storing the changes, with little overhead for a | |
* (16 bit) marker word, unless the changes are very fragmented, which due to | |
* the design of B+Tree nodes and other stored structures should be rare. | |
* | |
* Note: for a first version of a page, compare to a 0 filled original. | |
*/ | |
class RLE0DifEncoder { | |
public: | |
/** Encodes the difference between the given @original and @update memory | |
* blocks of size @input_size and stores the encoded differences in the | |
* given @output. Will not exceed @output_size in the output, and update | |
* @output_size to the amount of bytes written to @output. | |
* Returns 'false' if the encoded bytes don't fit in the given @output. | |
*/ | |
static bool Encode( | |
const void* original, // in: original mem block before update | |
const void* update, // in: mem block after updata | |
size_t& input_size, // in: size of input mem blocks; out: # bytes consumed | |
void* output, // out: encoded diffs between @original and @update | |
size_t& output_size); // in: size of @output; out: # bytes written to @output | |
/** Decodes the encoded differences in @diffs versus the given read-only | |
* @original and writes them to the given @dest page, copying bytes from | |
* @original to @dest when a 'run-of-0s' is encountered. | |
* Returns 'false' if the given @dest_size is not large enough. | |
* Throws if @diffs_size is too small according to the encoded contents | |
* of @diffs. | |
*/ | |
static bool Decode( | |
const void* original, // in: original mem block before update | |
void* dest, // out: reconstructed update | |
size_t& dest_size, // in: size of input mem block; out: # bytes written | |
const void* diffs, // in: RLE0 dif encoded data to be restored | |
size_t diffs_size); // in: size of @diffs | |
/** Decodes the encoded differences in @input versus the writable | |
* copy of the original given as @dest, skipping bytes in @dest when | |
* a 'run-of-0s' is encountered. | |
* Returns 'false' if the given @dest_size is not large enough. | |
* Throws if @diffs_size is too small according to the encoded contents | |
* of @diffs. | |
*/ | |
static bool Decode( | |
void* dest, // in: mem block with original data to be updated in-place | |
size_t& dest_size, // in: size of input mem block; out: # bytes written | |
const void* diffs, // in: RLE0 dif encoded data to be restored | |
size_t diffs_size); // in: size of @diffs | |
}; | |
}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "RLE0DifEncoder.h" | |
#include <random> | |
#include <algorithm> | |
#include <iostream> | |
static void TryRLE0() { | |
using random_bytes_engine = std::independent_bits_engine<std::default_random_engine, std::numeric_limits<uint16_t>::digits, uint16_t>; | |
random_bytes_engine rbe; | |
auto rbe_uint8_t = [&rbe] { | |
return (uint8_t)(rbe() & 0xFF); | |
}; | |
// memory for the different pages. | |
auto mem = (uint8_t*)malloc(4096 * 4); | |
auto orig = mem; | |
auto update = orig + 4096; | |
auto restored = update + 4096; | |
auto diffs = restored + 4096; | |
// generate random content for original | |
std::generate(orig, orig + 4096, std::ref(rbe_uint8_t)); | |
for (auto i = 0; i < 1000; ++i) { | |
// create updates in update page. | |
memcpy(update, orig, 4096); | |
size_t encoded_len = 0; | |
auto n = 1 + (rbe() & 7); | |
for (auto j = 0; j < n; j++) { | |
auto offs = rbe() % 4096; | |
auto len = 1 + (rbe() % 256); | |
if (offs + len > 4096) | |
len = 4096 - offs; | |
std::generate(update + offs, update + offs + len, std::ref(rbe_uint8_t)); | |
encoded_len += len; | |
} | |
// encode differences. | |
size_t in_sz = 4096; | |
size_t out_sz = 4096; | |
if (!matise::journal::details::RLE0DifEncoder::Encode(orig, update, in_sz, diffs, out_sz)) { | |
std::cout << "Failed to encode" << std::endl; | |
break; | |
} | |
std::cout << i << ": encoded " << encoded_len << " bytes changes in " << out_sz << " bytes." << std::endl; | |
// decode to restored. | |
in_sz = 4096; | |
if (rbe() & 1) { | |
if (!matise::journal::details::RLE0DifEncoder::Decode(orig, restored, in_sz, diffs, out_sz)) { | |
std::cout << "Failed to decode (with orig option)" << std::endl; | |
break; | |
} | |
} else { | |
memcpy(restored, orig, 4096); | |
if (!matise::journal::details::RLE0DifEncoder::Decode(restored, in_sz, diffs, out_sz)) { | |
std::cout << "Failed to decode (with in place option)" << std::endl; | |
break; | |
} | |
} | |
// ensure restored matches update. | |
if (memcmp(restored, update, 4096) != 0) { | |
std::cout << "Failed: restored does not match update" << std::endl; | |
break; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment