Skip to content

Instantly share code, notes, and snippets.

@boxbackup-bot
Created May 19, 2019 10:07
Show Gist options
  • Save boxbackup-bot/b19c2981a665efd6168024893f4cb5f3 to your computer and use it in GitHub Desktop.
Save boxbackup-bot/b19c2981a665efd6168024893f4cb5f3 to your computer and use it in GitHub Desktop.
Trac issue 45 attachments
Index: lib/backupclient/BackupStoreFile.h
===================================================================
--- lib/backupclient/BackupStoreFile.h (revision 2361)
+++ lib/backupclient/BackupStoreFile.h (working copy)
@@ -214,11 +214,6 @@
static void ResetStats();
static BackupStoreFileStats msStats;
- // For debug
-#ifndef NDEBUG
- static bool TraceDetailsOfDiffProcess;
-#endif
-
// For decoding encoded files
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile);
};
Index: test/backupdiff/testbackupdiff.cpp
===================================================================
--- test/backupdiff/testbackupdiff.cpp (revision 2361)
+++ test/backupdiff/testbackupdiff.cpp (working copy)
@@ -390,7 +390,7 @@
// Want to trace out all the details
#ifndef NDEBUG
#ifndef WIN32
- BackupStoreFile::TraceDetailsOfDiffProcess = true;
+ Logging::SetGlobalLevel(Log::TRACE);
#endif
#endif
Index: lib/backupclient/BackupStoreFileDiff.cpp
===================================================================
--- lib/backupclient/BackupStoreFileDiff.cpp (revision 2361)
+++ lib/backupclient/BackupStoreFileDiff.cpp (working copy)
@@ -38,22 +38,26 @@
using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef NDEBUG
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile,
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
+static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID,
+ BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut,
+ int Timeout, bool &rCanDiffFromThis);
+static void SearchForMatchingBlocks(IOStream &rFile,
+ std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks,
DiffTimer *pDiffTimer);
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
+static void SetupHashTable(BlocksAvailableEntry *pIndex,
+ int64_t NumBlocks, int32_t BlockSize,
+ BlocksAvailableEntry **pHashTable);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList,
+ RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize,
+ uint8_t *pHigh, int32_t HighSize,
+ int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex,
+ std::map<int64_t, int64_t> &rFoundBlocks);
+static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks,
+ std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// --------------------------------------------------------------------------
//
@@ -168,10 +172,6 @@
try
{
- // Find which sizes should be scanned
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-
// Flag for reporting to the user
bool completelyDifferent;
@@ -186,8 +186,7 @@
// Get size of file
sizeOfInputFile = file.BytesLeftToRead();
// Find all those lovely matching blocks
- SearchForMatchingBlocks(file, foundBlocks, pindex,
- blocksInIndex, sizesToScan, pDiffTimer);
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer);
// Is it completely different?
completelyDifferent = (foundBlocks.size() == 0);
@@ -363,393 +362,771 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Finds the most commonly used block sizes in the index
+// Name: static void SearchForMatchingBlocks(IOStream &,
+// std::map<int64_t, int64_t> &,
+// BlocksAvailableEntry *, int64_t, DiffTimer *)
+// Purpose: Find the matching blocks within the file.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+static void SearchForMatchingBlocks(IOStream &rFile,
+ std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer)
{
- // Array for lengths
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ Timer maximumDiffingTime(0, "MaximumDiffingTime");
- // Set arrays to lots of zeros (= unused entries)
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
+ if(pDiffTimer && pDiffTimer->IsManaged())
{
- Sizes[l] = 0;
- sizeCounts[l] = 0;
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
+ "MaximumDiffingTime");
}
- // Array for collecting sizes
- std::map<int32_t, int64_t> foundSizes;
+ // Flag to abort the run, if too many blocks are found or the diffing
+ // timeout expires
+ bool abortSearch = false;
- // Run through blocks and make a count of the entries
- for(int64_t b = 0; b < NumBlocks; ++b)
+ // Buffers used during both phases of search
+ uint8_t *pbuffer0 = 0;
+ uint8_t *pbuffer1 = 0;
+
+ // Track offsets that already have block matches. We don't really care
+ // if its sorted, and this actually produces a performance issue, so
+ // see pfitBitmap for a workaround
+ std::map<int64_t, int32_t> goodnessOfFit;
+
+ // Collect sizes that aren't found in the file at their old offset
+ std::map<int32_t, int64_t> unmatchedSizes;
+
+ // Our arrays of block sizes during second search pass
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int32_t maxScanBlockSize = 0;
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We need to keep separate rolling checksums for each block size in second search
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // And a hash lookup table per block size in second search
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We allow second search pass to short-circuit off rare blocks
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // During block read we have a bitmap of prefit locations to avoid std::map
+ // performance problems
+ uint8_t *pfitBitmap = 0;
+
+
+ // First search pass...
+ //
+ // For many files (especially large ones) most of the file is unchanged.
+ // The RollingChecksum process requires us to read every byte of the file looking for
+ // blocks that have moved. However, we can make that process more efficient by
+ // quickly rolling over areas that match a different block. We can also use
+ // this to eliminate entirely the rolling checksum for block sizes that only exist
+ // at one location in the file.
+ //
+ // Thus we start by looking for blocks that have not moved. Only if a block
+ // cannot be found at its previous location do we consider scanning for it by
+ // rolling checksum size.
+ //
+ // This strategy has some disadvantages for files with lots of repeating content
+ // that happens to align with our block size, but the reduction in diff time
+ // for more typical files is worth it.
+ //
+ // Note also that in this pass we consider _all_ block sizes (smaller than
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare
+ // is cheap for us to find in this pass.
+ //
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE);
+ try
{
- // Only if the block size is bigger than the minimum size we'll scan for
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
+ if(pbuffer0 == 0)
{
- // Find entry?
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
- if(f != foundSizes.end())
+ throw std::bad_alloc();
+ }
+
+ // We have to track file offset since the read may fail
+ int64_t fileOffset = 0;
+
+ // Walk the blocks
+ for(int64_t b = 0; b < NumBlocks; ++b)
+ {
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
{
- // Increment existing entry
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
}
- else
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Skip blocks too large for our buffer
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) {
+ fileOffset += pIndex[b].mSize;
+ continue;
+ }
+
+ // Have to guard the seek operation, it could throw. We don't know size of the
+ // current file, and checking is pointless anyway since there's a race.
+ // In reality on Unix this is implemented with lseek which will mean we can
+ // seek past the EOF, but I don't want to make assumptions about Win32.
+ int32_t readSize = 0;
+ try
{
- // New entry
- foundSizes[pIndex[b].mSize] = 1;
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute);
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize);
}
+ catch(BoxException &e)
+ {
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError)
+ {
+ // Not what we expected, rethrow
+ throw;
+ }
+ }
+
+ // Check for a match
+ bool blockMatched = false;
+ if(readSize == pIndex[b].mSize)
+ {
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you
+ // worry this is expensive just remember that prior versions of this code
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated
+ // rolling checksums every time.
+ MD5Digest strong;
+ strong.Add(pbuffer0, pIndex[b].mSize);
+ strong.Finish();
+
+ // Do we have a match?
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset);
+#endif
+ rFoundBlocks[fileOffset] = b;
+ goodnessOfFit[fileOffset] = pIndex[b].mSize;
+ blockMatched = true;
+ }
+ }
+
+ // We're done with the file offset, so increment now
+ fileOffset += pIndex[b].mSize;
+
+ // If the block didn't match then this is a size we'll have to scan for
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE))
+ {
+ // Find entry?
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize));
+ if(f != unmatchedSizes.end())
+ {
+ // Increment existing entry
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1;
+ }
+ else
+ {
+ // New entry
+ unmatchedSizes[pIndex[b].mSize] = 1;
+ }
+ }
}
+
+ // Cleanup
+ ::free(pbuffer0);
+ pbuffer0 = 0;
}
+ catch(...)
+ {
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
+ throw;
+ }
+
+ // Are we already out of time?
+ if(abortSearch)
+ {
+#ifndef NDEBUG
+ goto dumpDiffList;
+#endif
+ return;
+ }
- // Make the block sizes
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
+
+ // Second search pass...
+ //
+ // In our second phase, having matched all unchanged blocks we now need
+ // to scan for moved blocks. This involves looping across all unmatched
+ // block sizes and using the rolling checksum to look for relocations. To keep
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES
+ // for the number of sizes to scan. We also scan for the blocks in order of
+ // their relative probability, a block size that occurs frequently is scanned
+ // first.
+ //
+
+ // Loop all sizes inserting higher usages into the array
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i)
{
- // Find the position of the size in the array
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases
+ // it would be less expensive to upload a few thousand bytes rather than do the scan.
+ // Here would be a good place to filter block sizes that aren't worth the effort
+ // once a suitable set of heuristics is found.
+
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- // Instead of sorting on the raw count of blocks,
- // take the file area covered by this block size.
- if(i->second * i->first > sizeCounts[t] * Sizes[t])
+ // Instead of sorting on the raw count of blocks, take the file area covered by this
+ // block size. This helps avoid favoring low numbers of large blocks over many
+ // small blocks.
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t]))
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
{
- Sizes[s] = Sizes[s-1];
- sizeCounts[s] = sizeCounts[s-1];
+ scanSizes[s] = scanSizes[s-1];
+ scanSizesCount[s] = scanSizesCount[s-1];
}
// Insert this size
- Sizes[t] = i->first;
- sizeCounts[t] = i->second;
+ scanSizes[t] = i->first;
+ scanSizesCount[t] = i->second;
- // Shouldn't do any more searching
+ // Update max size
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first;
+
+ // Shouldn't do any more searching for this size
break;
}
}
}
-
- // trace the size table in debug builds
+
+ // Dump the size table in debug builds
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
+ if(scanSizes[t] != 0)
{
- BOX_TRACE("Diff block size " << t << ": " <<
- Sizes[t] << " (count = " <<
- sizeCounts[t] << ")");
+ BOX_TRACE("Scan block size " << t << ": " <<
+ scanSizes[t] << " count: " <<
+ scanSizesCount[t]);
}
}
#endif
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Find the matching blocks within the file.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
- BlocksAvailableEntry *pIndex, int64_t NumBlocks,
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
- Timer maximumDiffingTime(0, "MaximumDiffingTime");
-
- if(pDiffTimer && pDiffTimer->IsManaged())
+ // If we didn't find any sizes (could happen if all were found in the first matching
+ // phase) we're done.
+ if(maxScanBlockSize == 0)
{
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
- "MaximumDiffingTime");
+#ifndef NDEBUG
+ BOX_TRACE("No scan block sizes, skip rolling checksum");
+ goto dumpDiffList;
+#endif
+ return;
}
-
- std::map<int64_t, int32_t> goodnessOfFit;
- // Allocate the hash lookup table
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE);
- // Choose a size for the buffer, just a little bit more than the maximum block size
- int32_t bufSize = Sizes[0];
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
- {
- if(Sizes[z] > bufSize) bufSize = Sizes[z];
- }
- bufSize += 4;
- ASSERT(bufSize > Sizes[0]);
- ASSERT(bufSize > 0);
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
+ // Allocate two buffers we'll toggle between at the max scan block size
+ // Their sizes are doubled to make final block at the end easier (we're cheating)
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2);
- // TODO: Because we read in the file a scanned block size at a time,
- // it is likely to be inefficient. Probably will be much better to
- // calculate checksums for all block sizes in a single pass.
-
- // Allocate the buffers.
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
+ // Allocate a bitmap buffer to optimize goodnessOfFit access
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8);
+
try
{
// Check buffer allocation
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0)
{
// If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
- // Flag to abort the run, if too many blocks are found -- avoid using
- // huge amounts of processor time when files contain many similar blocks.
- bool abortSearch = false;
+ // Shift file position back to beginning
+ int64_t bufferFileOffset = 0;
+ rFile.Seek(0, IOStream::SeekType_Absolute);
- // Search for each block size in turn
- // NOTE: Do the smallest size first, so that the scheme for adding
- // entries in the found list works as expected and replaces smallers block
- // with larger blocks when it finds matches at the same offset in the file.
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
+ // We're going to be flipping back and forth between two buffers, the low and high
+ uint8_t *lowBuffer = pbuffer0;
+ int32_t lowBufferBytes = 0;
+ uint8_t *highBuffer = pbuffer1;
+ int32_t highBufferBytes = 0;
+
+ // In some cases we need to carry over reads from prior buffers
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // Read the first buffer's worth of data
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize);
+ // Fill the second buffer if appropriate
+ if(lowBufferBytes == maxScanBlockSize)
{
- ASSERT(Sizes[s] <= bufSize);
- BOX_TRACE("Diff pass " << s << ", for block size " <<
- Sizes[s]);
-
- // Check we haven't finished
- if(Sizes[s] == 0)
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+ }
+
+ // For every block size, initialize our scan tracking
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ ASSERT(scanSizes[z] <= maxScanBlockSize);
+
+ // The sizes array may be mostly empty, in those cases we have no
+ // state to maintain.
+ if(scanSizes[z] != 0)
{
- // empty entry, try next size
- continue;
+ // Mark for scan
+ scanThisSize[z] = true;
+
+ // Set up the hash table for this size
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ if(hashTables[z] == 0)
+ {
+ throw std::bad_alloc();
+ }
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]);
+
+ // Set up a rolling checksum, but only if there is enough data to start with
+ // (file may now be shorter than some block sizes previously used)
+ if(scanSizes[z] <= lowBufferBytes)
+ {
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]);
+ }
}
-
- // Set up the hash table entries
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
+ }
- // Shift file position to beginning
- rFile.Seek(0, IOStream::SeekType_Absolute);
-
- // Read first block
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
+ // Read loop while we can get full maxScanBlockSize reads
+ while(highBufferBytes == maxScanBlockSize)
+ {
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across
+ // the two buffers, which means we can walk every block size across all offsets
+ // in lowBuffer without any concern about overrunning the data available in highBuffer.
+
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
{
- // Size of file too short to match -- do next size
- continue;
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
}
- // Setup block pointers
- uint8_t *beginnings = pbuffer0;
- uint8_t *endings = pbuffer1;
- int offset = 0;
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
- // Calculate the first checksum, ready for rolling
- RollingChecksum rolling(beginnings, Sizes[s]);
+ // Don't you wish hash_map was standard? std::map is very slow
+ // when we access it as often as we do in the loop below. To work around
+ // we'll create a bitmap of the previous fits in this buffer
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break;
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
- // Then roll, until the file is exhausted
- int64_t fileBlockNumber = 0;
- int64_t fileOffset = 0;
- int rollOverInitialBytes = 0;
- while(true)
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
{
- if(maximumDiffingTime.HasExpired())
- {
- ASSERT(pDiffTimer != NULL);
- BOX_INFO("MaximumDiffingTime reached - "
- "suspending file diff");
- abortSearch = true;
- break;
- }
+ // If there is no rolling checksum at this index, skip: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
+
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset);
+#endif
- if(pDiffTimer)
- {
- pDiffTimer->DoKeepAlive();
- }
-
- // Load in another block of data, and record how big it is
- int bytesInEndings = rFile.Read(endings, Sizes[s]);
- int tmp;
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
- // Skip any bytes from a previous matched block
- if(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
{
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+ ASSERT(carryOverBytes[s] > 0);
+ // Carry can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the current max scan size
+ int32_t thisCarry = carryOverBytes[s];
+ if(thisCarry > maxScanBlockSize)
+ {
+ thisCarry = maxScanBlockSize;
+ }
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ // Perform carry
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize)
{
- goto refresh;
+ // Roll is split some low, some low/high
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s];
+ ASSERT(lowRollBytes >= 0);
+ if(lowRollBytes > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes);
+ }
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes);
}
+ else
+ {
+ // Roll is all in low buffer
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry);
+ }
+ // Either way offset is carry
+ bufferOffset = thisCarry;
+ // Reduce carry by the carry amount actually carried
+ carryOverBytes[s] -= thisCarry;
+ ASSERT(carryOverBytes[s] >= 0);
}
-
- if(goodnessOfFit.count(fileOffset))
+
+ // Loop remaining low buffer bytes, taking a checksum at each offset
+ while (bufferOffset < maxScanBlockSize)
{
- tmp = goodnessOfFit[fileOffset];
- }
- else
- {
- tmp = 0;
- }
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
- if(tmp >= Sizes[s])
- {
- // Skip over bigger ready-matched blocks completely
- rollOverInitialBytes = tmp;
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- goto refresh;
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
}
- }
- while(offset < bytesInEndings)
- {
- // Is current checksum in hash list?
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ if(bestFitSoFarSize >= scanSizes[s])
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ // Roll can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the max scan size
+ int32_t thisRoll = bestFitSoFarSize;
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
{
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- goodnessOfFit[fileOffset] = Sizes[s];
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
- // Block matched, roll the checksum forward to the next block without doing
- // any more comparisons, because these are pointless (as any more matches will be ignored when
- // the recipe is generated) and just take up valuable processor time. Edge cases are
- // especially nasty, using huge amounts of time and memory.
- int skip = Sizes[s];
- if(offset < bytesInEndings && skip > 0)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will
+ // push the rolling checksum so that it is against the end of the buffer.
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
- offset += thisRoll;
- fileOffset += thisRoll;
- skip -= thisRoll;
- }
- // Not all the bytes necessary will have been skipped, so get them
- // skipped after the next block is loaded.
- rollOverInitialBytes = skip;
-
- // End this loop, so the final byte isn't used again
- break;
- }
- else
+ if(thisRoll < 1)
{
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
}
- int64_t NumBlocksFound = static_cast<int64_t>(
- rFoundBlocks.size());
- int64_t MaxBlocksFound = NumBlocks *
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-
- if(NumBlocksFound > MaxBlocksFound)
+ // Roll high/low section. Our roll here will either exhaust the total or will
+ // leave the rolling checksum positioned at offset zero in the high buffer
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ int32_t lowSize = maxScanBlockSize - bufferOffset;
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s];
+ int32_t highSize = scanSizes[s] - lowSize;
+ if(SecondStageMatch(hashTables[s][hashValue],
+ rollingSums[s],
+ lowBuffer + bufferOffset, lowSize,
+ highBuffer, highSize,
+ scanSizes[s],
+ bufferFileOffset + bufferOffset,
+ pIndex, rFoundBlocks))
{
- abortSearch = true;
- break;
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize
+ // roll above for an explanation of the logic
+ int32_t thisRoll = scanSizes[s];
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
+ {
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
+ if(thisRoll < 1)
+ {
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
+ }
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
}
- // Roll checksum forward
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-
- // Increment offsets
- ++offset;
- ++fileOffset;
+ // Roll one byte, either low or low/high depending on current offset
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize)
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]);
+ }
+ else
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ }
+ ++bufferOffset;
}
+
+ // Check if we've found too much
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+ }
+ // If we're aborting, pass through
+ if(abortSearch) break;
+
+ // Swap buffers so high is low and the go back around
+ uint8_t *swap = lowBuffer;
+ lowBuffer = highBuffer;
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize));
+ bufferFileOffset += maxScanBlockSize;
+ highBuffer = swap;
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+
+ } // End of maxScanBlockSize reads
+
+ // What remains is either:
+ // - An abort or timeout
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer
+ // (Potentially zero low bytes for a zero-length file)
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer
+ //
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0))
+ {
+ // We're going to repeat the logic in the previous read loop, but have
+ // to be careful not to walk off the end of the available data. To make this
+ // a lot less convoluted, we're just going to copy to consolidate buffers.
+ // This is cheating a bit, but worth the simplicity.
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true);
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true);
+ uint8_t *endingBuffer = lowBuffer;
+ if(highBufferBytes != 0)
+ {
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes);
+ }
+ int32_t endingBytes = lowBufferBytes + highBufferBytes;
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // std::map still not winning
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it; it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
- if(abortSearch) break;
+ // If there's not enough data at the end, skip this block size
+ if(endingBytes < scanSizes[s]) continue;
+
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset);
+#endif
- refresh:
- // Finished?
- if(bytesInEndings != Sizes[s])
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
+
+ // For each block size we can only walk so far safely
+ int32_t maxSafeOffset = endingBytes - scanSizes[s];
+
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
{
- // No more data in file -- check the final block
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
- // each byte of the file)
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ ASSERT(carryOverBytes[s] > 0);
+ // Don't walk off the end
+ if(carryOverBytes[s] > maxSafeOffset) continue;
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]);
+ bufferOffset = carryOverBytes[s];
+ carryOverBytes[s] = 0; // Not really needed again, but be clean
+ }
+
+ // Loop remaining end bytes, we must walk to exactly and then break the loop
+ while (bufferOffset <= maxSafeOffset)
+ {
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Space for this roll?
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize);
+ bufferOffset += bestFitSoFarSize;
+ continue; // Back around to byte read loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ if(SecondStageMatch(hashTables[s][hashValue],
+ rollingSums[s],
+ endingBuffer + bufferOffset,
+ scanSizes[s],
+ 0, 0, scanSizes[s],
+ bufferFileOffset + bufferOffset,
+ pIndex, rFoundBlocks))
{
- goodnessOfFit[fileOffset] = Sizes[s];
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll over this matched block
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]);
+ bufferOffset += scanSizes[s];
+ continue; // Back around to byte read loop
}
}
-
- // finish
- break;
+
+ // Abort check
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+
+ // Because byte loop conditional can equal maxSafeOffset we must not read off
+ // end here
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ ++bufferOffset;
}
-
- // Switch buffers, reset offset
- beginnings = endings;
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
- offset = 0;
-
- // And count the blocks which have been done
- ++fileBlockNumber;
+ if(abortSearch) break;
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above
}
-
- if(abortSearch) break;
}
- // Free buffers and hash table
+ // Clean up
+ ::free(pbuffer0);
+ pbuffer0 = 0;
::free(pbuffer1);
pbuffer1 = 0;
- ::free(pbuffer0);
- pbuffer0 = 0;
- ::free(phashTable);
- phashTable = 0;
+ ::free(pfitBitmap);
+ pfitBitmap = 0;
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0)
+ {
+ free(hashTables[z]);
+ hashTables[z] = 0;
+ }
+ if(rollingSums[z] != 0)
+ {
+ delete rollingSums[z];
+ rollingSums[z] = 0;
+ }
+ }
}
catch(...)
{
- // Cleanup and throw
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
if(pbuffer1 != 0) ::free(pbuffer1);
- if(pbuffer0 != 0) ::free(pbuffer0);
- if(phashTable != 0) ::free(phashTable);
+ if(pfitBitmap != 0) ::free(pfitBitmap);
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0) free(hashTables[z]);
+ }
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(rollingSums[z] != 0) delete rollingSums[z];
+ }
throw;
}
-
+
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+dumpDiffList:
+ // Trace out the found blocks in debug mode
+ BOX_TRACE("Diff: list of found blocks");
+ BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(" Offset BlkIdx Size Movement");
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
{
- // Trace out the found blocks in debug mode
- BOX_TRACE("Diff: list of found blocks");
- BOX_TRACE("======== ======== ======== ========");
- BOX_TRACE(" Offset BlkIdx Size Movement");
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
+ int64_t orgLoc = 0;
+ for(int64_t b = 0; b < i->second; ++b)
{
- int64_t orgLoc = 0;
- for(int64_t b = 0; b < i->second; ++b)
- {
- orgLoc += pIndex[b].mSize;
- }
- BOX_TRACE(std::setw(8) << i->first << " " <<
- std::setw(8) << i->second << " " <<
- std::setw(8) << pIndex[i->second].mSize <<
- " " <<
- std::setw(8) << (i->first - orgLoc));
+ orgLoc += pIndex[b].mSize;
}
- BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(std::setw(8) << i->first << " " <<
+ std::setw(8) << i->second << " " <<
+ std::setw(8) << pIndex[i->second].mSize <<
+ " " <<
+ std::setw(8) << (i->first - orgLoc));
}
+ BOX_TRACE("======== ======== ======== ========");
#endif
}
@@ -781,6 +1158,8 @@
{
//BOX_TRACE("Another hash entry for " << hash << " found");
// Yes -- need to set the pointer in this entry to the current entry to build the linked list
+ // This is safe because there is only one hash table per block size and block sizes
+ // are unique.
pIndex[b].mpNextInHashList = pHashTable[hash];
}
@@ -790,30 +1169,35 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
// Name: static bool SecondStageMatch(xxx)
-// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
+// Purpose: When a match in the hash table is found, scan for
+// second stage match using strong checksum.
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList,
+ RollingChecksum *pFastSum, uint8_t *pLow, int32_t LowSize,
+ uint8_t *pHigh, int32_t HighSize, int32_t BlockSize,
+ int64_t FileOffset, BlocksAvailableEntry *pIndex,
+ std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
- ASSERT(pBeginnings != 0);
- ASSERT(pEndings != 0);
- ASSERT(Offset >= 0);
+ ASSERT(pFirstInHashList != 0);
+ ASSERT(pFastSum != 0);
+ ASSERT(pLow != 0);
+ ASSERT(LowSize != 0);
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true);
ASSERT(BlockSize > 0);
- ASSERT(pFirstInHashList != 0);
+ ASSERT(BlockSize == (LowSize + HighSize));
ASSERT(pIndex != 0);
#ifndef NDEBUG
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing();
#endif
- uint32_t Checksum = fastSum.GetChecksum();
+ uint32_t Checksum = pFastSum->GetChecksum();
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
BlocksAvailableEntry *scan = pFirstInHashList;
@@ -834,18 +1218,13 @@
// Calculate the strong MD5 digest for this block
MD5Digest strong;
- // Add the data from the beginnings
- strong.Add(pBeginnings + Offset, BlockSize - Offset);
- // Add any data from the endings
- if(Offset > 0)
- {
- strong.Add(pEndings, Offset);
- }
+ strong.Add(pLow, LowSize);
+ if(HighSize > 0) strong.Add(pHigh, HighSize);
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
scan = pFirstInHashList;
- //BOX_TRACE("second stage match");
+
while(scan != 0)
{
//BOX_TRACE("scan size " << scan->mSize <<
@@ -853,25 +1232,22 @@
// ", hash " << Hash);
ASSERT(scan->mSize == BlockSize);
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-
+
// Compare?
if(strong.DigestMatches(scan->mStrongChecksum))
{
- //BOX_TRACE("Match!\n");
// Found! Add to list of found blocks...
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
// We do NOT search for smallest blocks first, as this code originally assumed.
// To prevent this from potentially overwriting a better match, the caller must determine
// the relative "goodness" of any existing match and this one, and avoid the call if it
// could be detrimental.
- rFoundBlocks[fileOffset] = blockIndex;
+ rFoundBlocks[FileOffset] = blockIndex;
// No point in searching further, report success
return true;
}
-
// Next
scan = scan->mpNextInHashList;
}
@@ -917,11 +1293,8 @@
rRecipe.push_back(instruction);
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- BOX_TRACE("Diff: Default recipe generated, " <<
- SizeOfInputFile << " bytes of file");
- }
+ BOX_TRACE("Diff: Default recipe generated, " <<
+ SizeOfInputFile << " bytes of file");
#endif
// Don't do anything
@@ -1016,7 +1389,8 @@
BOX_TRACE("Diff: " <<
debug_NewBytesFound << " new bytes found, " <<
debug_OldBlocksUsed << " old blocks used");
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+
+ if (true)
{
BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size());
BOX_TRACE("======== ========= ========");
Index: test/bbackupd/testbbackupd.cpp
===================================================================
--- test/bbackupd/testbbackupd.cpp (revision 2361)
+++ test/bbackupd/testbbackupd.cpp (working copy)
@@ -1002,22 +1002,27 @@
// before any matching blocks could be found.
intercept_setup_delay("testfiles/TestDir1/spacetest/f1",
0, 4000, SYS_read, 1);
- pid = start_internal_daemon();
- intercept_clear_setup();
+ {
+ Timers::Init();
+ BackupDaemon bbackupd;
+ bbackupd.Configure("testfiles/bbackupd.conf");
+ bbackupd.InitCrypto();
- fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY);
- TEST_THAT(fd > 0);
- // write again, to update the file's timestamp
- TEST_EQUAL(sizeof(buffer), write(fd, buffer, sizeof(buffer)),
- "Buffer write");
- TEST_THAT(close(fd) == 0);
+ fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY);
+ TEST_THAT(fd > 0);
+ // write again, to update the file's timestamp
+ TEST_EQUAL(1, write(fd, "z", 1), "Buffer write");
+ TEST_THAT(close(fd) == 0);
- wait_for_backup_operation();
- // can't test whether intercept was triggered, because
- // it's in a different process.
- // TEST_THAT(intercept_triggered());
- TEST_THAT(stop_internal_daemon(pid));
+ // wait long enough to put file into sync window
+ wait_for_operation(5);
+ bbackupd.RunSyncNow();
+ TEST_THAT(intercept_triggered());
+ intercept_clear_setup();
+ Timers::Cleanup();
+ }
+
// check that the diff was aborted, i.e. upload was not a diff
found1 = false;
--- lib/backupclient/BackupStoreFile.h 2008-03-03 19:34:49.000000000 -0800
+++ lib/backupclient/BackupStoreFile.h 2008-02-27 20:17:20.000000000 -0800
@@ -206,11 +206,6 @@
static void ResetStats();
static BackupStoreFileStats msStats;
- // For debug
-#ifndef NDEBUG
- static bool TraceDetailsOfDiffProcess;
-#endif
-
// For decoding encoded files
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile);
};
--- test/backupdiff/testbackupdiff.cpp 2008-03-03 19:34:37.000000000 -0800
+++ test/backupdiff/testbackupdiff.cpp 2008-02-27 20:22:53.000000000 -0800
@@ -376,7 +376,7 @@
// Want to trace out all the details
#ifndef NDEBUG
#ifndef WIN32
- BackupStoreFile::TraceDetailsOfDiffProcess = true;
+ Logging::SetGlobalLevel(Log::TRACE);
#endif
#endif
--- lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 19:34:49.000000000 -0800
+++ lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 20:09:08.000000000 -0800
@@ -36,21 +36,12 @@
using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef NDEBUG
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile,
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
- DiffTimer *pDiffTimer);
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// --------------------------------------------------------------------------
@@ -166,10 +157,6 @@
try
{
- // Find which sizes should be scanned
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-
// Flag for reporting to the user
bool completelyDifferent;
@@ -184,8 +171,7 @@
// Get size of file
sizeOfInputFile = file.BytesLeftToRead();
// Find all those lovely matching blocks
- SearchForMatchingBlocks(file, foundBlocks, pindex,
- blocksInIndex, sizesToScan, pDiffTimer);
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer);
// Is it completely different?
completelyDifferent = (foundBlocks.size() == 0);
@@ -361,390 +347,744 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Finds the most commonly used block sizes in the index
+// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, DiffTimer *)
+// Purpose: Find the matching blocks within the file.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer)
{
- // Array for lengths
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ Timer maximumDiffingTime(0);
- // Set arrays to lots of zeros (= unused entries)
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
+ if(pDiffTimer && pDiffTimer->IsManaged())
{
- Sizes[l] = 0;
- sizeCounts[l] = 0;
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime());
}
- // Array for collecting sizes
- std::map<int32_t, int64_t> foundSizes;
+ // Flag to abort the run, if too many blocks are found or the diffing
+ // timeout expires
+ bool abortSearch = false;
+
+ // Buffers used during both phases of search
+ uint8_t *pbuffer0 = 0;
+ uint8_t *pbuffer1 = 0;
+
+ // Track offsets that already have block matches. We don't really care
+ // if its sorted, and this actually produces a performance issue, so
+ // see pfitBitmap for a workaround
+ std::map<int64_t, int32_t> goodnessOfFit;
- // Run through blocks and make a count of the entries
- for(int64_t b = 0; b < NumBlocks; ++b)
+ // Collect sizes that aren't found in the file at their old offset
+ std::map<int32_t, int64_t> unmatchedSizes;
+
+ // Our arrays of block sizes during second search pass
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int32_t maxScanBlockSize = 0;
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We need to keep separate rolling checksums for each block size in second search
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // And a hash lookup table per block size in second search
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We allow second search pass to short-circuit off rare blocks
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // During block read we have a bitmap of prefit locations to avoid std::map
+ // performance problems
+ uint8_t *pfitBitmap = 0;
+
+
+ // First search pass...
+ //
+ // For many files (especially large ones) most of the file is unchanged.
+ // The RollingChecksum process requires us to read every byte of the file looking for
+ // blocks that have moved. However, we can make that process more efficient by
+ // quickly rolling over areas that match a different block. We can also use
+ // this to eliminate entirely the rolling checksum for block sizes that only exist
+ // at one location in the file.
+ //
+ // Thus we start by looking for blocks that have not moved. Only if a block
+ // cannot be found at its previous location do we consider scanning for it by
+ // rolling checksum size.
+ //
+ // This strategy has some disadvantages for files with lots of repeating content
+ // that happens to align with our block size, but the reduction in diff time
+ // for more typical files is worth it.
+ //
+ // Note also that in this pass we consider _all_ block sizes (smaller than
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare
+ // is cheap for us to find in this pass.
+ //
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE);
+ try
{
- // Only if the block size is bigger than the minimum size we'll scan for
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
+ if(pbuffer0 == 0)
+ {
+ throw std::bad_alloc();
+ }
+
+ // We have to track file offset since the read may fail
+ int64_t fileOffset = 0;
+
+ // Walk the blocks
+ for(int64_t b = 0; b < NumBlocks; ++b)
{
- // Find entry?
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
- if(f != foundSizes.end())
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
+ {
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
+ }
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Skip blocks too large for our buffer
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) {
+ fileOffset += pIndex[b].mSize;
+ continue;
+ }
+
+ // Have to guard the seek operation, it could throw. We don't know size of the
+ // current file, and checking is pointless anyway since there's a race.
+ // In reality on Unix this is implemented with lseek which will mean we can
+ // seek past the EOF, but I don't want to make assumptions about Win32.
+ int32_t readSize = 0;
+ try
+ {
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute);
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize);
+ }
+ catch(BoxException &e)
+ {
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError)
+ {
+ // Not what we expected, rethrow
+ throw;
+ }
+ }
+
+ // Check for a match
+ bool blockMatched = false;
+ if(readSize == pIndex[b].mSize)
{
- // Increment existing entry
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you
+ // worry this is expensive just remember that prior versions of this code
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated
+ // rolling checksums every time.
+ MD5Digest strong;
+ strong.Add(pbuffer0, pIndex[b].mSize);
+ strong.Finish();
+
+ // Do we have a match?
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset);
+#endif
+ rFoundBlocks[fileOffset] = b;
+ goodnessOfFit[fileOffset] = pIndex[b].mSize;
+ blockMatched = true;
+ }
}
- else
+
+ // We're done with the file offset, so increment now
+ fileOffset += pIndex[b].mSize;
+
+ // If the block didn't match then this is a size we'll have to scan for
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE))
{
- // New entry
- foundSizes[pIndex[b].mSize] = 1;
+ // Find entry?
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize));
+ if(f != unmatchedSizes.end())
+ {
+ // Increment existing entry
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1;
+ }
+ else
+ {
+ // New entry
+ unmatchedSizes[pIndex[b].mSize] = 1;
+ }
}
}
+
+ // Cleanup
+ ::free(pbuffer0);
+ pbuffer0 = 0;
}
-
- // Make the block sizes
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
+ catch(...)
+ {
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
+ throw;
+ }
+
+ // Are we already out of time?
+ if(abortSearch)
{
- // Find the position of the size in the array
+#ifndef NDEBUG
+ goto dumpDiffList;
+#endif
+ return;
+ }
+
+
+ // Second search pass...
+ //
+ // In our second phase, having matched all unchanged blocks we now need
+ // to scan for moved blocks. This involves looping across all unmatched
+ // block sizes and using the rolling checksum to look for relocations. To keep
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES
+ // for the number of sizes to scan. We also scan for the blocks in order of
+ // their relative probability, a block size that occurs frequently is scanned
+ // first.
+ //
+
+ // Loop all sizes inserting higher usages into the array
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i)
+ {
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases
+ // it would be less expensive to upload a few thousand bytes rather than do the scan.
+ // Here would be a good place to filter block sizes that aren't worth the effort
+ // once a suitable set of heuristics is found.
+
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- // Instead of sorting on the raw count of blocks,
- // take the file area covered by this block size.
- if(i->second * i->first > sizeCounts[t] * Sizes[t])
+ // Instead of sorting on the raw count of blocks, take the file area covered by this
+ // block size. This helps avoid favoring low numbers of large blocks over many
+ // small blocks.
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t]))
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
{
- Sizes[s] = Sizes[s-1];
- sizeCounts[s] = sizeCounts[s-1];
+ scanSizes[s] = scanSizes[s-1];
+ scanSizesCount[s] = scanSizesCount[s-1];
}
// Insert this size
- Sizes[t] = i->first;
- sizeCounts[t] = i->second;
+ scanSizes[t] = i->first;
+ scanSizesCount[t] = i->second;
- // Shouldn't do any more searching
+ // Update max size
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first;
+
+ // Shouldn't do any more searching for this size
break;
}
}
}
-
- // trace the size table in debug builds
+ // Dump the size table in debug builds
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
- {
- TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]);
- }
+ if(scanSizes[t] != 0) BOX_TRACE("Scan block size " << t << ": " << scanSizes[t] << " count: " << scanSizesCount[t]);
}
#endif
-}
-
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Find the matching blocks within the file.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
- BlocksAvailableEntry *pIndex, int64_t NumBlocks,
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
- Timer maximumDiffingTime(0);
- if(pDiffTimer && pDiffTimer->IsManaged())
+ // If we didn't find any sizes (could happen if all were found in the first matching
+ // phase) we're done.
+ if(maxScanBlockSize == 0)
{
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime());
+#ifndef NDEBUG
+ BOX_TRACE("No scan block sizes, skip rolling checksum");
+ goto dumpDiffList;
+#endif
+ return;
}
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE);
- std::map<int64_t, int32_t> goodnessOfFit;
-
- // Allocate the hash lookup table
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
-
- // Choose a size for the buffer, just a little bit more than the maximum block size
- int32_t bufSize = Sizes[0];
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
- {
- if(Sizes[z] > bufSize) bufSize = Sizes[z];
- }
- bufSize += 4;
- ASSERT(bufSize > Sizes[0]);
- ASSERT(bufSize > 0);
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
+ // Allocate two buffers we'll toggle between at the max scan block size
+ // Their sizes are doubled to make final block at the end easier (we're cheating)
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+
+ // Allocate a bitmap buffer to optimize goodnessOfFit access
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8);
- // TODO: Because we read in the file a scanned block size at a time,
- // it is likely to be inefficient. Probably will be much better to
- // calculate checksums for all block sizes in a single pass.
-
- // Allocate the buffers.
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
try
{
// Check buffer allocation
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0)
{
// If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
- // Flag to abort the run, if too many blocks are found -- avoid using
- // huge amounts of processor time when files contain many similar blocks.
- bool abortSearch = false;
-
- // Search for each block size in turn
- // NOTE: Do the smallest size first, so that the scheme for adding
- // entries in the found list works as expected and replaces smallers block
- // with larger blocks when it finds matches at the same offset in the file.
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
- {
- ASSERT(Sizes[s] <= bufSize);
- BOX_TRACE("Diff pass " << s << ", for block size " <<
- Sizes[s]);
-
- // Check we haven't finished
- if(Sizes[s] == 0)
- {
- // empty entry, try next size
- continue;
- }
-
- // Set up the hash table entries
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
+ // Shift file position back to beginning
+ int64_t bufferFileOffset = 0;
+ rFile.Seek(0, IOStream::SeekType_Absolute);
- // Shift file position to beginning
- rFile.Seek(0, IOStream::SeekType_Absolute);
-
- // Read first block
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
+ // We're going to be flipping back and forth between two buffers, the low and high
+ uint8_t *lowBuffer = pbuffer0;
+ int32_t lowBufferBytes = 0;
+ uint8_t *highBuffer = pbuffer1;
+ int32_t highBufferBytes = 0;
+
+ // In some cases we need to carry over reads from prior buffers
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // Read the first buffer's worth of data
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize);
+ // Fill the second buffer if appropriate
+ if(lowBufferBytes == maxScanBlockSize)
+ {
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+ }
+
+ // For every block size, initialize our scan tracking
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ ASSERT(scanSizes[z] <= maxScanBlockSize);
+
+ // The sizes array may be mostly empty, in those cases we have no
+ // state to maintain.
+ if(scanSizes[z] != 0)
{
- // Size of file too short to match -- do next size
- continue;
- }
-
- // Setup block pointers
- uint8_t *beginnings = pbuffer0;
- uint8_t *endings = pbuffer1;
- int offset = 0;
-
- // Calculate the first checksum, ready for rolling
- RollingChecksum rolling(beginnings, Sizes[s]);
-
- // Then roll, until the file is exhausted
- int64_t fileBlockNumber = 0;
- int64_t fileOffset = 0;
- int rollOverInitialBytes = 0;
- while(true)
- {
- if(maximumDiffingTime.HasExpired())
- {
- ASSERT(pDiffTimer != NULL);
- BOX_INFO("MaximumDiffingTime reached - "
- "suspending file diff");
- abortSearch = true;
- break;
- }
+ // Mark for scan
+ scanThisSize[z] = true;
- if(pDiffTimer)
+ // Set up the hash table for this size
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ if(hashTables[z] == 0)
{
- pDiffTimer->DoKeepAlive();
+ throw std::bad_alloc();
}
-
- // Load in another block of data, and record how big it is
- int bytesInEndings = rFile.Read(endings, Sizes[s]);
- int tmp;
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]);
- // Skip any bytes from a previous matched block
- if(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ // Set up a rolling checksum, but only if there is enough data to start with
+ // (file may now be shorter than some block sizes previously used)
+ if(scanSizes[z] <= lowBufferBytes)
{
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]);
+ }
+ }
+ }
+
+ // Read loop while we can get full maxScanBlockSize reads
+ while(highBufferBytes == maxScanBlockSize)
+ {
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across
+ // the two buffers, which means we can walk every block size across all offsets
+ // in lowBuffer without any concern about overrunning the data available in highBuffer.
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
+ {
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
+ }
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Don't you wish hash_map was standard? std::map is very slow
+ // when we access it as often as we do in the loop below. To work around
+ // this, we'll create a bitmap of the previous fits in this buffer
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break;
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset);
+#endif
+
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
- if(rollOverInitialBytes)
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
+ {
+ ASSERT(carryOverBytes[s] > 0);
+ // Carry can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the current max scan size
+ int32_t thisCarry = carryOverBytes[s];
+ if(thisCarry > maxScanBlockSize)
{
- goto refresh;
+ thisCarry = maxScanBlockSize;
}
- }
-
- if(goodnessOfFit.count(fileOffset))
- {
- tmp = goodnessOfFit[fileOffset];
- }
- else
- {
- tmp = 0;
- }
- if(tmp >= Sizes[s])
- {
- // Skip over bigger ready-matched blocks completely
- rollOverInitialBytes = tmp;
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ // Perform carry
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize)
+ {
+ // Roll is split some low, some low/high
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s];
+ ASSERT(lowRollBytes >= 0);
+ if(lowRollBytes > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes);
+ }
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes);
+ }
+ else
{
- goto refresh;
+ // Roll is all in low buffer
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry);
}
+ // Either way offset is carry
+ bufferOffset = thisCarry;
+ // Reduce carry by the carry amount actually carried
+ carryOverBytes[s] -= thisCarry;
+ ASSERT(carryOverBytes[s] >= 0);
}
-
- while(offset < bytesInEndings)
+
+ // Loop remaining low buffer bytes, taking a checksum at each offset
+ while (bufferOffset < maxScanBlockSize)
{
- // Is current checksum in hash list?
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Roll can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the max scan size
+ int32_t thisRoll = bestFitSoFarSize;
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
{
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- goodnessOfFit[fileOffset] = Sizes[s];
-
- // Block matched, roll the checksum forward to the next block without doing
- // any more comparisons, because these are pointless (as any more matches will be ignored when
- // the receipe is generated) and just take up valuable processor time. Edge cases are
- // especially nasty, using huge amounts of time and memory.
- int skip = Sizes[s];
- if(offset < bytesInEndings && skip > 0)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- skip -= thisRoll;
- }
- // Not all the bytes necessary will have been skipped, so get them
- // skipped after the next block is loaded.
- rollOverInitialBytes = skip;
-
- // End this loop, so the final byte isn't used again
- break;
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
}
- else
+
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will
+ // push the rolling checksum so that it is against the end of the buffer.
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
{
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
}
-
- int64_t NumBlocksFound = static_cast<int64_t>(
- rFoundBlocks.size());
- int64_t MaxBlocksFound = NumBlocks *
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-
- if(NumBlocksFound > MaxBlocksFound)
+ if(thisRoll < 1)
{
- abortSearch = true;
- break;
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
}
+ // Roll high/low section. Our roll here will either exhaust the total or will
+ // leave the rolling checksum positioned at offset zero in the high buffer
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
- // Roll checksum forward
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-
- // Increment offsets
- ++offset;
- ++fileOffset;
- }
-
- if(abortSearch) break;
-
- refresh:
- // Finished?
- if(bytesInEndings != Sizes[s])
- {
- // No more data in file -- check the final block
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
- // each byte of the file)
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ int32_t lowSize = maxScanBlockSize - bufferOffset;
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s];
+ int32_t highSize = scanSizes[s] - lowSize;
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], lowBuffer + bufferOffset, lowSize,
+ highBuffer, highSize, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks))
{
- goodnessOfFit[fileOffset] = Sizes[s];
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize
+ // roll above for an explanation of the logic
+ int32_t thisRoll = scanSizes[s];
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
+ {
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
+ if(thisRoll < 1)
+ {
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
+ }
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
}
+
+ // Roll one byte, either low or low/high depending on current offset
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize)
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]);
+ }
+ else
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ }
+ ++bufferOffset;
+ }
- // finish
+ // Check if we've found too much
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
break;
}
-
- // Switch buffers, reset offset
- beginnings = endings;
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
- offset = 0;
+ }
+ // If we're aborting, pass through
+ if(abortSearch) break;
+
+ // Swap buffers so high is low and then go back around
+ uint8_t *swap = lowBuffer;
+ lowBuffer = highBuffer;
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize));
+ bufferFileOffset += maxScanBlockSize;
+ highBuffer = swap;
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
- // And count the blocks which have been done
- ++fileBlockNumber;
+ } // End of maxScanBlockSize reads
+
+ // What remains is either:
+ // - An abort or timeout
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer
+ // (Potentially zero low bytes for a zero-length file)
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer
+ //
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0))
+ {
+ // We're going to repeat the logic in the previous read loop, but have
+ // to be careful not to walk off the end of the available data. To make this
+ // a lot less convoluted, we're just going to copy to consolidate buffers.
+ // This is cheating a bit, but worth the simplicity.
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true);
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true);
+ uint8_t *endingBuffer = lowBuffer;
+ if(highBufferBytes != 0)
+ {
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes);
+ }
+ int32_t endingBytes = lowBufferBytes + highBufferBytes;
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // std::map still not winning
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
}
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
+
+ // If there's not enough data at the end, skip this block size
+ if(endingBytes < scanSizes[s]) continue;
- if(abortSearch) break;
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset);
+#endif
+
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
+
+ // For each block size we can only walk so far safely
+ int32_t maxSafeOffset = endingBytes - scanSizes[s];
+
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
+ {
+ ASSERT(carryOverBytes[s] > 0);
+ // Don't walk off the end
+ if(carryOverBytes[s] > maxSafeOffset) continue;
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]);
+ bufferOffset = carryOverBytes[s];
+ carryOverBytes[s] = 0; // Not really needed again, but be clean
+ }
+
+ // Loop over the remaining end bytes; we must walk exactly to maxSafeOffset and then break the loop
+ while (bufferOffset <= maxSafeOffset)
+ {
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
+ {
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Space for this roll?
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize);
+ bufferOffset += bestFitSoFarSize;
+ continue; // Back around to byte read loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], endingBuffer + bufferOffset, scanSizes[s],
+ 0, 0, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll over this matched block
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]);
+ bufferOffset += scanSizes[s];
+ continue; // Back around to byte read loop
+ }
+ }
+
+ // Abort check
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+
+ // Because the byte loop conditional can equal maxSafeOffset, we must not read off
+ // the end here
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ ++bufferOffset;
+ }
+ if(abortSearch) break;
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above
+ }
}
- // Free buffers and hash table
- ::free(pbuffer1);
- pbuffer1 = 0;
+ // Clean up
::free(pbuffer0);
pbuffer0 = 0;
- ::free(phashTable);
- phashTable = 0;
+ ::free(pbuffer1);
+ pbuffer1 = 0;
+ ::free(pfitBitmap);
+ pfitBitmap = 0;
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0)
+ {
+ free(hashTables[z]);
+ hashTables[z] = 0;
+ }
+ if(rollingSums[z] != 0)
+ {
+ delete rollingSums[z];
+ rollingSums[z] = 0;
+ }
+ }
}
catch(...)
{
- // Cleanup and throw
- if(pbuffer1 != 0) ::free(pbuffer1);
+ // Cleanup and rethrow
if(pbuffer0 != 0) ::free(pbuffer0);
- if(phashTable != 0) ::free(phashTable);
+ if(pbuffer1 != 0) ::free(pbuffer1);
+ if(pfitBitmap != 0) ::free(pfitBitmap);
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0) free(hashTables[z]);
+ }
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(rollingSums[z] != 0) delete rollingSums[z];
+ }
throw;
}
-
+
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- // Trace out the found blocks in debug mode
- BOX_TRACE("Diff: list of found blocks");
- BOX_TRACE("======== ======== ======== ========");
- BOX_TRACE(" Offset BlkIdx Size Movement");
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
- {
- int64_t orgLoc = 0;
- for(int64_t b = 0; b < i->second; ++b)
- {
- orgLoc += pIndex[b].mSize;
- }
- BOX_TRACE(std::setw(8) << i->first << " " <<
- std::setw(8) << i->second << " " <<
- std::setw(8) << pIndex[i->second].mSize <<
- " " <<
- std::setw(8) << (i->first - orgLoc));
- }
- BOX_TRACE("======== ======== ======== ========");
+dumpDiffList:
+ // Trace out the found blocks in debug mode
+ BOX_TRACE("Diff: list of found blocks");
+ BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(" Offset BlkIdx Size Movement");
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
+ {
+ int64_t orgLoc = 0;
+ for(int64_t b = 0; b < i->second; ++b)
+ {
+ orgLoc += pIndex[b].mSize;
+ }
+ BOX_TRACE(std::setw(8) << i->first << " " <<
+ std::setw(8) << i->second << " " <<
+ std::setw(8) << pIndex[i->second].mSize <<
+ " " <<
+ std::setw(8) << (i->first - orgLoc));
}
+ BOX_TRACE("======== ======== ======== ========");
#endif
}
@@ -776,6 +1116,8 @@
{
//TRACE1("Another hash entry for %d found\n", hash);
// Yes -- need to set the pointer in this entry to the current entry to build the linked list
+ // This is safe because there is only one hash table per block size and block sizes
+ // are unique.
pIndex[b].mpNextInHashList = pHashTable[hash];
}
@@ -785,7 +1127,6 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
@@ -794,21 +1135,24 @@
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
- ASSERT(pBeginnings != 0);
- ASSERT(pEndings != 0);
- ASSERT(Offset >= 0);
- ASSERT(BlockSize > 0);
ASSERT(pFirstInHashList != 0);
+ ASSERT(pFastSum != 0);
+ ASSERT(pLow != 0);
+ ASSERT(LowSize != 0);
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true);
+ ASSERT(BlockSize > 0);
+ ASSERT(BlockSize == (LowSize + HighSize));
ASSERT(pIndex != 0);
#ifndef NDEBUG
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing();
#endif
- uint32_t Checksum = fastSum.GetChecksum();
+ uint32_t Checksum = pFastSum->GetChecksum();
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
BlocksAvailableEntry *scan = pFirstInHashList;
@@ -829,42 +1173,33 @@
// Calculate the strong MD5 digest for this block
MD5Digest strong;
- // Add the data from the beginnings
- strong.Add(pBeginnings + Offset, BlockSize - Offset);
- // Add any data from the endings
- if(Offset > 0)
- {
- strong.Add(pEndings, Offset);
- }
+ strong.Add(pLow, LowSize);
+ if(HighSize > 0) strong.Add(pHigh, HighSize);
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
scan = pFirstInHashList;
- //TRACE0("second stage match\n");
while(scan != 0)
{
//TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash);
ASSERT(scan->mSize == BlockSize);
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-
+
// Compare?
if(strong.DigestMatches(scan->mStrongChecksum))
{
- //TRACE0("Match!\n");
// Found! Add to list of found blocks...
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
// We do NOT search for smallest blocks first, as this code originally assumed.
// To prevent this from potentially overwriting a better match, the caller must determine
// the relative "goodness" of any existing match and this one, and avoid the call if it
// could be detrimental.
- rFoundBlocks[fileOffset] = blockIndex;
+ rFoundBlocks[FileOffset] = blockIndex;
// No point in searching further, report success
return true;
}
-
// Next
scan = scan->mpNextInHashList;
}
@@ -909,12 +1244,9 @@
instruction.mSpaceBefore = SizeOfInputFile;
rRecipe.push_back(instruction);
- #ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
- }
- #endif
+#ifndef NDEBUG
+ TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
+#endif
// Don't do anything
return;
@@ -1006,22 +1338,19 @@
// dump out the recipe
#ifndef NDEBUG
TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed);
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
{
- TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
+ for(unsigned int e = 0; e < rRecipe.size(); ++e)
{
- for(unsigned int e = 0; e < rRecipe.size(); ++e)
- {
- char b[64];
+ char b[64];
#ifdef WIN32
- sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
+ sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#else
- sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
+ sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#endif
- TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
- }
+ TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
}
- TRACE0("======== ========= ========\n");
}
+ TRACE0("======== ========= ========\n");
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment