Created
February 2, 2018 20:28
-
-
Save redknightlois/2789379536da6d57cdf27e7ba65fd527 to your computer and use it in GitHub Desktop.
Diffing memory example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Runtime.CompilerServices; | |
namespace Sparrow.Utils | |
{ | |
/// <summary> | |
/// This class computes a diff between two buffers and write | |
/// the diff to a temp location. | |
/// | |
/// Assumptions: | |
/// - The buffers are all the same size | |
/// - The buffers size is a multiple of page boundary | |
/// | |
/// If the diff overflow, we'll use the modified value. | |
/// </summary> | |
public unsafe class DiffPages | |
{ | |
public byte* Output; | |
public long OutputSize; | |
public bool IsDiff { get; private set; } | |
public void ComputeDiff(void* originalBuffer, void* modifiedBuffer, int size) | |
{ | |
Debug.Assert(size % 4096 == 0); | |
const int pageSize = 4096; | |
const int cacheBlocks = 512; | |
const int blocksCount = pageSize / cacheBlocks; | |
OutputSize = 0; | |
IsDiff = true; | |
int pages = size / pageSize; | |
ulong* startPtr = (ulong*)originalBuffer; | |
ulong* ptr = startPtr; | |
// PERF: This allows us to do pointer arithmetics and use relative addressing using the | |
// hardware instructions without needed an extra register. | |
long offset = (long*)modifiedBuffer - (long*)originalBuffer; | |
ulong* runStartPtr = null; | |
ulong* runEndPtr = null; | |
// For each page of 4096 bytes in size | |
for (int page = 0; page < pages; page++) | |
{ | |
ulong st1 = 0, st2 = 0, st3 = 0, st4 = 0; | |
ulong* blockPtr = ptr; | |
// For each block of X bytes in size ( usually 512 bytes, that is 64 long values per block) | |
for (int block = 0; block < blocksCount; block++) | |
{ | |
// PERF: We should prefetch here the X bytes when PREFETCH is available. | |
const int packetsCount = cacheBlocks / sizeof(ulong) / 4; | |
for (int packet = 0; packet < packetsCount; packet++, ptr += 4) | |
{ | |
// PERF: In order to minimize latency | |
st1 <<= 4; | |
st2 <<= 4; | |
st3 <<= 4; | |
st4 <<= 4; | |
st1 |= *(ptr + 0) == *(ptr + offset + 0) ? 0ul : 1ul; | |
st2 |= *(ptr + 1) == *(ptr + offset + 1) ? 0ul : 1ul; | |
st3 |= *(ptr + 2) == *(ptr + offset + 2) ? 0ul : 1ul; | |
st4 |= *(ptr + 3) == *(ptr + offset + 3) ? 0ul : 1ul; | |
// This is an optimization that would allow us to find exact runs using some bit magic. | |
// We are not ready for that yet. | |
// PERF: This idiom should be optimized like hell (we should just 'or' the zero flag, if not report it to Andy :D ) | |
//st1 |= (*(ptr + 0) ^ *(ptr + offset + 0)) == 0 ? 0ul : 1ul; | |
//st2 |= (*(ptr + 1) ^ *(ptr + offset + 1)) == 0 ? 0ul : 1ul; | |
//st3 |= (*(ptr + 2) ^ *(ptr + offset + 2)) == 0 ? 0ul : 1ul; | |
//st4 |= (*(ptr + 3) ^ *(ptr + offset + 3)) == 0 ? 0ul : 1ul; | |
} | |
// PERF: We construct if from different registers to avoid the latency introduced by RAW hazards | |
ulong blockBitmap = (st1 << 3) | (st2 << 2) | (st3 << 1) | st4; | |
if (blockBitmap == 0) | |
{ | |
// If the block hasnt been touched, nothing to do here unless we have an open pointer. | |
if (runStartPtr != null) | |
{ | |
// We have a run and we are looking to the start of a hole. | |
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case. | |
runStartPtr = null; | |
runEndPtr = null; | |
} | |
} | |
else | |
{ | |
const int ulongsPerPacket = 4; | |
const int ulongsPerBlock = cacheBlocks / sizeof(ulong); | |
Debug.Assert( ulongsPerBlock % ulongsPerPacket == 0); | |
ulong mask = 0x0Ful << (64 - ulongsPerPacket); | |
for ( int i = 0; i < ulongsPerBlock / ulongsPerPacket; i += 1, mask >>= ulongsPerPacket) | |
{ | |
bool isUntouched = (blockBitmap & mask) == 0; | |
if (runStartPtr == null) | |
{ | |
if (isUntouched) | |
continue; // We dont have a run, therefore we go to the next. | |
runStartPtr = blockPtr + (i * ulongsPerPacket); | |
runEndPtr = runStartPtr + ulongsPerPacket; // TODO: Check we are using 'ptr indirect addressing' on runEndPtr | |
} | |
else // if (runPtr != null) | |
{ | |
if (!isUntouched) | |
{ | |
// Add the 4 ulong that had been tested to the run. | |
runEndPtr += ulongsPerPacket; | |
} | |
else | |
{ | |
//long countCheck = allZeros ? 0 : runEndPtr - runStartPtr; | |
long countCheck = (runEndPtr - runStartPtr) * sizeof(ulong); | |
if (OutputSize + countCheck + sizeof(ulong) * 2 > size) | |
goto CopyFull; | |
// We have a run and we are looking to the start of a hole. | |
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case. | |
runStartPtr = null; | |
runEndPtr = null; | |
} | |
} | |
} | |
} | |
// We advance the pointer to the next block. | |
blockPtr = ptr; | |
} | |
} | |
// If the block hasnt been touched, nothing to do here unless we have an open pointer. | |
if (runStartPtr != null) | |
{ | |
//long countCheck = allZeros ? 0 : runEndPtr - runStartPtr; | |
long countCheck = (runEndPtr - runStartPtr) * sizeof(ulong); | |
if (OutputSize + countCheck + sizeof(ulong) * 2 > size) | |
goto CopyFull; | |
// We have a run and we are looking to the start of a hole. | |
WriteDiff(runStartPtr, runEndPtr, startPtr, offset); // TODO: Handle all zeroes case. | |
} | |
return; | |
CopyFull: | |
CopyFullBuffer((byte*)modifiedBuffer, size); | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private void WriteDiff(ulong* start, ulong* end, ulong* srcPtr, long destOffset) | |
{ | |
Debug.Assert(end - start > 0); | |
Debug.Assert((OutputSize % sizeof(long)) == 0); | |
long outputSize = OutputSize; | |
long* outputPtr = (long*)Output + outputSize / sizeof(long); | |
long startIdx = start - srcPtr; // The start index of the run based from the start of the page we are diffing. | |
long runLengthInBytes = (end - start) * sizeof(ulong); // The run length from the start to the end. | |
outputPtr[0] = startIdx * sizeof(ulong); | |
outputPtr[1] = runLengthInBytes; | |
outputSize += sizeof(long) * 2; | |
Unsafe.CopyBlock(Output + outputSize, (srcPtr + destOffset) + startIdx, (uint)runLengthInBytes); | |
OutputSize = outputSize + runLengthInBytes; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private void CopyFullBuffer(byte* modified, int size) | |
{ | |
// too big, no saving, just use the full modification | |
OutputSize = size; | |
Memory.Copy(Output, modified, size); | |
IsDiff = false; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment