Skip to content

Instantly share code, notes, and snippets.

@redknightlois
Created February 2, 2018 20:28
Show Gist options
  • Save redknightlois/2789379536da6d57cdf27e7ba65fd527 to your computer and use it in GitHub Desktop.
Save redknightlois/2789379536da6d57cdf27e7ba65fd527 to your computer and use it in GitHub Desktop.
Diffing memory example
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace Sparrow.Utils
{
/// <summary>
/// This class computes a diff between two buffers and write
/// the diff to a temp location.
///
/// Assumptions:
/// - The buffers are all the same size
/// - The buffers size is a multiple of page boundary
///
/// If the diff overflow, we'll use the modified value.
/// </summary>
public unsafe class DiffPages
{
public byte* Output;
public long OutputSize;
public bool IsDiff { get; private set; }
public void ComputeDiff(void* originalBuffer, void* modifiedBuffer, int size)
{
Debug.Assert(size % 4096 == 0);
const int pageSize = 4096;
const int cacheBlocks = 512;
const int blocksCount = pageSize / cacheBlocks;
OutputSize = 0;
IsDiff = true;
int pages = size / pageSize;
ulong* startPtr = (ulong*)originalBuffer;
ulong* ptr = startPtr;
// PERF: This allows us to do pointer arithmetics and use relative addressing using the
// hardware instructions without needed an extra register.
long offset = (long*)modifiedBuffer - (long*)originalBuffer;
ulong* runStartPtr = null;
ulong* runEndPtr = null;
// For each page of 4096 bytes in size
for (int page = 0; page < pages; page++)
{
ulong st1 = 0, st2 = 0, st3 = 0, st4 = 0;
ulong* blockPtr = ptr;
// For each block of X bytes in size ( usually 512 bytes, that is 64 long values per block)
for (int block = 0; block < blocksCount; block++)
{
// PERF: We should prefetch here the X bytes when PREFETCH is available.
const int packetsCount = cacheBlocks / sizeof(ulong) / 4;
for (int packet = 0; packet < packetsCount; packet++, ptr += 4)
{
// PERF: In order to minimize latency
st1 <<= 4;
st2 <<= 4;
st3 <<= 4;
st4 <<= 4;
st1 |= *(ptr + 0) == *(ptr + offset + 0) ? 0ul : 1ul;
st2 |= *(ptr + 1) == *(ptr + offset + 1) ? 0ul : 1ul;
st3 |= *(ptr + 2) == *(ptr + offset + 2) ? 0ul : 1ul;
st4 |= *(ptr + 3) == *(ptr + offset + 3) ? 0ul : 1ul;
// This is an optimization that would allow us to find exact runs using some bit magic.
// We are not ready for that yet.
// PERF: This idiom should be optimized like hell (we should just 'or' the zero flag, if not report it to Andy :D )
//st1 |= (*(ptr + 0) ^ *(ptr + offset + 0)) == 0 ? 0ul : 1ul;
//st2 |= (*(ptr + 1) ^ *(ptr + offset + 1)) == 0 ? 0ul : 1ul;
//st3 |= (*(ptr + 2) ^ *(ptr + offset + 2)) == 0 ? 0ul : 1ul;
//st4 |= (*(ptr + 3) ^ *(ptr + offset + 3)) == 0 ? 0ul : 1ul;
}
// PERF: We construct if from different registers to avoid the latency introduced by RAW hazards
ulong blockBitmap = (st1 << 3) | (st2 << 2) | (st3 << 1) | st4;
if (blockBitmap == 0)
{
// If the block hasnt been touched, nothing to do here unless we have an open pointer.
if (runStartPtr != null)
{
// We have a run and we are looking to the start of a hole.
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case.
runStartPtr = null;
runEndPtr = null;
}
}
else
{
const int ulongsPerPacket = 4;
const int ulongsPerBlock = cacheBlocks / sizeof(ulong);
Debug.Assert( ulongsPerBlock % ulongsPerPacket == 0);
ulong mask = 0x0Ful << (64 - ulongsPerPacket);
for ( int i = 0; i < ulongsPerBlock / ulongsPerPacket; i += 1, mask >>= ulongsPerPacket)
{
bool isUntouched = (blockBitmap & mask) == 0;
if (runStartPtr == null)
{
if (isUntouched)
continue; // We dont have a run, therefore we go to the next.
runStartPtr = blockPtr + (i * ulongsPerPacket);
runEndPtr = runStartPtr + ulongsPerPacket; // TODO: Check we are using 'ptr indirect addressing' on runEndPtr
}
else // if (runPtr != null)
{
if (!isUntouched)
{
// Add the 4 ulong that had been tested to the run.
runEndPtr += ulongsPerPacket;
}
else
{
//long countCheck = allZeros ? 0 : runEndPtr - runStartPtr;
long countCheck = (runEndPtr - runStartPtr) * sizeof(ulong);
if (OutputSize + countCheck + sizeof(ulong) * 2 > size)
goto CopyFull;
// We have a run and we are looking to the start of a hole.
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case.
runStartPtr = null;
runEndPtr = null;
}
}
}
}
// We advance the pointer to the next block.
blockPtr = ptr;
}
}
// If the block hasnt been touched, nothing to do here unless we have an open pointer.
if (runStartPtr != null)
{
//long countCheck = allZeros ? 0 : runEndPtr - runStartPtr;
long countCheck = (runEndPtr - runStartPtr) * sizeof(ulong);
if (OutputSize + countCheck + sizeof(ulong) * 2 > size)
goto CopyFull;
// We have a run and we are looking to the start of a hole.
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case.
}
return;
CopyFull:
CopyFullBuffer((byte*)modifiedBuffer, size);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void WriteDiff(ulong* start, ulong* end, ulong* srcPtr, long destOffset)
{
Debug.Assert(end - start > 0);
Debug.Assert((OutputSize % sizeof(long)) == 0);
long outputSize = OutputSize;
long* outputPtr = (long*)Output + outputSize / sizeof(long);
long startIdx = start - srcPtr; // The start index of the run based from the start of the page we are diffing.
long runLengthInBytes = (end - start) * sizeof(ulong); // The run length from the start to the end.
outputPtr[0] = startIdx * sizeof(ulong);
outputPtr[1] = runLengthInBytes;
outputSize += sizeof(long) * 2;
Unsafe.CopyBlock(Output + outputSize, (srcPtr + destOffset) + startIdx, (uint)runLengthInBytes);
OutputSize = outputSize + runLengthInBytes;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void CopyFullBuffer(byte* modified, int size)
{
// too big, no saving, just use the full modification
OutputSize = size;
Memory.Copy(Output, modified, size);
IsDiff = false;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment