Created
May 19, 2019 10:07
-
-
Save boxbackup-bot/b19c2981a665efd6168024893f4cb5f3 to your computer and use it in GitHub Desktop.
Trac issue 45 attachments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index: lib/backupclient/BackupStoreFile.h | |
=================================================================== | |
--- lib/backupclient/BackupStoreFile.h (revision 2361) | |
+++ lib/backupclient/BackupStoreFile.h (working copy) | |
@@ -214,11 +214,6 @@ | |
static void ResetStats(); | |
static BackupStoreFileStats msStats; | |
- // For debug | |
-#ifndef NDEBUG | |
- static bool TraceDetailsOfDiffProcess; | |
-#endif | |
- | |
// For decoding encoded files | |
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile); | |
}; | |
Index: test/backupdiff/testbackupdiff.cpp | |
=================================================================== | |
--- test/backupdiff/testbackupdiff.cpp (revision 2361) | |
+++ test/backupdiff/testbackupdiff.cpp (working copy) | |
@@ -390,7 +390,7 @@ | |
// Want to trace out all the details | |
#ifndef NDEBUG | |
#ifndef WIN32 | |
- BackupStoreFile::TraceDetailsOfDiffProcess = true; | |
+ Logging::SetGlobalLevel(Log::TRACE); | |
#endif | |
#endif | |
Index: lib/backupclient/BackupStoreFileDiff.cpp | |
=================================================================== | |
--- lib/backupclient/BackupStoreFileDiff.cpp (revision 2361) | |
+++ lib/backupclient/BackupStoreFileDiff.cpp (working copy) | |
@@ -38,22 +38,26 @@ | |
using namespace BackupStoreFileCryptVar; | |
using namespace BackupStoreFileCreation; | |
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. | |
-// But it's useful for the test. | |
-#ifndef NDEBUG | |
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false; | |
-#endif | |
- | |
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); | |
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); | |
-static void SearchForMatchingBlocks(IOStream &rFile, | |
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, | |
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], | |
+static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, | |
+ BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, | |
+ int Timeout, bool &rCanDiffFromThis); | |
+static void SearchForMatchingBlocks(IOStream &rFile, | |
+ std::map<int64_t, int64_t> &rFoundBlocks, | |
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, | |
DiffTimer *pDiffTimer); | |
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); | |
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, | |
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); | |
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); | |
+static void SetupHashTable(BlocksAvailableEntry *pIndex, | |
+ int64_t NumBlocks, int32_t BlockSize, | |
+ BlocksAvailableEntry **pHashTable); | |
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, | |
+ RollingChecksum *pFastSum, | |
+ uint8_t *pLow, int32_t LowSize, | |
+ uint8_t *pHigh, int32_t HighSize, | |
+ int32_t BlockSize, int64_t FileOffset, | |
+ BlocksAvailableEntry *pIndex, | |
+ std::map<int64_t, int64_t> &rFoundBlocks); | |
+static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, | |
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, | |
+ std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); | |
// -------------------------------------------------------------------------- | |
// | |
@@ -168,10 +172,6 @@ | |
try | |
{ | |
- // Find which sizes should be scanned | |
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); | |
- | |
// Flag for reporting to the user | |
bool completelyDifferent; | |
@@ -186,8 +186,7 @@ | |
// Get size of file | |
sizeOfInputFile = file.BytesLeftToRead(); | |
// Find all those lovely matching blocks | |
- SearchForMatchingBlocks(file, foundBlocks, pindex, | |
- blocksInIndex, sizesToScan, pDiffTimer); | |
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer); | |
// Is it completely different? | |
completelyDifferent = (foundBlocks.size() == 0); | |
@@ -363,393 +362,771 @@ | |
} | |
} | |
- | |
// -------------------------------------------------------------------------- | |
// | |
// Function | |
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
-// Purpose: Finds the most commonly used block sizes in the index | |
+// Name: static void SearchForMatchingBlocks(IOStream &, | |
+// std::map<int64_t, int64_t> &, | |
+// BlocksAvailableEntry *, int64_t, DiffTimer *) | |
+// Purpose: Find the matching blocks within the file. | |
// Created: 12/1/04 | |
// | |
// -------------------------------------------------------------------------- | |
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
+static void SearchForMatchingBlocks(IOStream &rFile, | |
+ std::map<int64_t, int64_t> &rFoundBlocks, | |
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer) | |
{ | |
- // Array for lengths | |
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ Timer maximumDiffingTime(0, "MaximumDiffingTime"); | |
- // Set arrays to lots of zeros (= unused entries) | |
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) | |
+ if(pDiffTimer && pDiffTimer->IsManaged()) | |
{ | |
- Sizes[l] = 0; | |
- sizeCounts[l] = 0; | |
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), | |
+ "MaximumDiffingTime"); | |
} | |
- // Array for collecting sizes | |
- std::map<int32_t, int64_t> foundSizes; | |
+ // Flag to abort the run, if too many blocks are found or the diffing | |
+ // timeout expires | |
+ bool abortSearch = false; | |
- // Run through blocks and make a count of the entries | |
- for(int64_t b = 0; b < NumBlocks; ++b) | |
+ // Buffers used during both phases of search | |
+ uint8_t *pbuffer0 = 0; | |
+ uint8_t *pbuffer1 = 0; | |
+ | |
+ // Track offsets that already have block matches. We don't really care | |
+ // if its sorted, and this actually produces a performance issue, so | |
+ // see pfitBitmap for a workaround | |
+ std::map<int64_t, int32_t> goodnessOfFit; | |
+ | |
+ // Collect sizes that aren't found in the file at their old offset | |
+ std::map<int32_t, int64_t> unmatchedSizes; | |
+ | |
+ // Our arrays of block sizes during second search pass | |
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ int32_t maxScanBlockSize = 0; | |
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // We need to keep separate rolling checksums for each block size in second search | |
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // And a hash lookup table per block size in second search | |
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // We allow second search pass to short-circuit off rare blocks | |
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // During block read we have a bitmap of prefit locations to avoid std::map | |
+ // performance problems | |
+ uint8_t *pfitBitmap = 0; | |
+ | |
+ | |
+ // First search pass... | |
+ // | |
+ // For many files (especially large ones) most of the file is unchanged. | |
+ // The RollingChecksum process requires us to read every byte of the file looking for | |
+ // blocks that have moved. However, we can make that process more efficient by | |
+ // quickly rolling over areas that match a different block. We can also use | |
+ // this to eliminate entirely the rolling checksum for block sizes that only exist | |
+ // at one location in the file. | |
+ // | |
+ // Thus we start by looking for blocks that have not moved. Only if a block | |
+ // cannot be found at its previous location do we consider scanning for it by | |
+ // rolling checksum size. | |
+ // | |
+ // This strategy has some disadvantages for files with lots of repeating content | |
+ // that happens to align with our block size, but the reduction in diff time | |
+ // for more typical files is worth it. | |
+ // | |
+ // Note also that in this pass we consider _all_ block sizes (smaller than | |
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare | |
+ // is cheap for us to find in this pass. | |
+ // | |
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE); | |
+ try | |
{ | |
- // Only if the block size is bigger than the minimum size we'll scan for | |
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) | |
+ if(pbuffer0 == 0) | |
{ | |
- // Find entry? | |
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); | |
- if(f != foundSizes.end()) | |
+ throw std::bad_alloc(); | |
+ } | |
+ | |
+ // We have to track file offset since the read may fail | |
+ int64_t fileOffset = 0; | |
+ | |
+ // Walk the blocks | |
+ for(int64_t b = 0; b < NumBlocks; ++b) | |
+ { | |
+ // Check diffing timeout | |
+ if(maximumDiffingTime.HasExpired()) | |
{ | |
- // Increment existing entry | |
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; | |
+ ASSERT(pDiffTimer != NULL); | |
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff"); | |
+ abortSearch = true; | |
+ break; | |
} | |
- else | |
+ | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
+ | |
+ // Skip blocks too large for our buffer | |
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) { | |
+ fileOffset += pIndex[b].mSize; | |
+ continue; | |
+ } | |
+ | |
+ // Have to guard the seek operation, it could throw. We don't know size of the | |
+ // current file, and checking is pointless anyway since there's a race. | |
+ // In reality on Unix this is implemented with lseek which will mean we can | |
+ // seek past the EOF, but I don't want to make assumptions about Win32. | |
+ int32_t readSize = 0; | |
+ try | |
{ | |
- // New entry | |
- foundSizes[pIndex[b].mSize] = 1; | |
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute); | |
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize); | |
} | |
+ catch(BoxException &e) | |
+ { | |
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError) | |
+ { | |
+ // Not what we expected, rethrow | |
+ throw; | |
+ } | |
+ } | |
+ | |
+ // Check for a match | |
+ bool blockMatched = false; | |
+ if(readSize == pIndex[b].mSize) | |
+ { | |
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you | |
+ // worry this is expensive just remember that prior versions of this code | |
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated | |
+ // rolling checksums every time. | |
+ MD5Digest strong; | |
+ strong.Add(pbuffer0, pIndex[b].mSize); | |
+ strong.Finish(); | |
+ | |
+ // Do we have a match? | |
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum)) | |
+ { | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset); | |
+#endif | |
+ rFoundBlocks[fileOffset] = b; | |
+ goodnessOfFit[fileOffset] = pIndex[b].mSize; | |
+ blockMatched = true; | |
+ } | |
+ } | |
+ | |
+ // We're done with the file offset, so increment now | |
+ fileOffset += pIndex[b].mSize; | |
+ | |
+ // If the block didn't match then this is a size we'll have to scan for | |
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)) | |
+ { | |
+ // Find entry? | |
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize)); | |
+ if(f != unmatchedSizes.end()) | |
+ { | |
+ // Increment existing entry | |
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1; | |
+ } | |
+ else | |
+ { | |
+ // New entry | |
+ unmatchedSizes[pIndex[b].mSize] = 1; | |
+ } | |
+ } | |
} | |
+ | |
+ // Cleanup | |
+ ::free(pbuffer0); | |
+ pbuffer0 = 0; | |
} | |
+ catch(...) | |
+ { | |
+ // Cleanup and rethrow | |
+ if(pbuffer0 != 0) ::free(pbuffer0); | |
+ throw; | |
+ } | |
+ | |
+ // Are we already out of time? | |
+ if(abortSearch) | |
+ { | |
+#ifndef NDEBUG | |
+ goto dumpDiffList; | |
+#endif | |
+ return; | |
+ } | |
- // Make the block sizes | |
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) | |
+ | |
+ // Second search pass... | |
+ // | |
+ // In our second phase, having matched all unchanged blocks we now need | |
+ // to scan for moved blocks. This involves looping across all unmatched | |
+ // block sizes and using the rolling checksum to look for relocations. To keep | |
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES | |
+ // for the number of sizes to scan. We also scan for the blocks in order of | |
+ // their relative probability, a block size that occurs frequently is scanned | |
+ // first. | |
+ // | |
+ | |
+ // Loop all sizes inserting higher usages into the array | |
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i) | |
{ | |
- // Find the position of the size in the array | |
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases | |
+ // it would be less expensive to upload a few thousand bytes rather than do the scan. | |
+ // Here would be a good place to filter block sizes that aren't worth the effort | |
+ // once a suitable set of heuristics is found. | |
+ | |
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
{ | |
- // Instead of sorting on the raw count of blocks, | |
- // take the file area covered by this block size. | |
- if(i->second * i->first > sizeCounts[t] * Sizes[t]) | |
+ // Instead of sorting on the raw count of blocks, take the file area covered by this | |
+ // block size. This helps avoid favoring low numbers of large blocks over many | |
+ // small blocks. | |
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t])) | |
{ | |
// Then this size belong before this entry -- shuffle them up | |
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) | |
{ | |
- Sizes[s] = Sizes[s-1]; | |
- sizeCounts[s] = sizeCounts[s-1]; | |
+ scanSizes[s] = scanSizes[s-1]; | |
+ scanSizesCount[s] = scanSizesCount[s-1]; | |
} | |
// Insert this size | |
- Sizes[t] = i->first; | |
- sizeCounts[t] = i->second; | |
+ scanSizes[t] = i->first; | |
+ scanSizesCount[t] = i->second; | |
- // Shouldn't do any more searching | |
+ // Update max size | |
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first; | |
+ | |
+ // Shouldn't do any more searching for this size | |
break; | |
} | |
} | |
} | |
- | |
- // trace the size table in debug builds | |
+ | |
+ // Dump the size table in debug builds | |
#ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
{ | |
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
+ if(scanSizes[t] != 0) | |
{ | |
- BOX_TRACE("Diff block size " << t << ": " << | |
- Sizes[t] << " (count = " << | |
- sizeCounts[t] << ")"); | |
+ BOX_TRACE("Scan block size " << t << ": " << | |
+ scanSizes[t] << " count: " << | |
+ scanSizesCount[t]); | |
} | |
} | |
#endif | |
-} | |
- | |
- | |
-// -------------------------------------------------------------------------- | |
-// | |
-// Function | |
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
-// Purpose: Find the matching blocks within the file. | |
-// Created: 12/1/04 | |
-// | |
-// -------------------------------------------------------------------------- | |
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, | |
- BlocksAvailableEntry *pIndex, int64_t NumBlocks, | |
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) | |
-{ | |
- Timer maximumDiffingTime(0, "MaximumDiffingTime"); | |
- | |
- if(pDiffTimer && pDiffTimer->IsManaged()) | |
+ // If we didn't find any sizes (could happen if all were found in the first matching | |
+ // phase) we're done. | |
+ if(maxScanBlockSize == 0) | |
{ | |
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), | |
- "MaximumDiffingTime"); | |
+#ifndef NDEBUG | |
+ BOX_TRACE("No scan block sizes, skip rolling checksum"); | |
+ goto dumpDiffList; | |
+#endif | |
+ return; | |
} | |
- | |
- std::map<int64_t, int32_t> goodnessOfFit; | |
- // Allocate the hash lookup table | |
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); | |
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE); | |
- // Choose a size for the buffer, just a little bit more than the maximum block size | |
- int32_t bufSize = Sizes[0]; | |
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
- { | |
- if(Sizes[z] > bufSize) bufSize = Sizes[z]; | |
- } | |
- bufSize += 4; | |
- ASSERT(bufSize > Sizes[0]); | |
- ASSERT(bufSize > 0); | |
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) | |
- { | |
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) | |
- } | |
+ // Allocate two buffers we'll toggle between at the max scan block size | |
+ // Their sizes are doubled to make final block at the end easier (we're cheating) | |
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2); | |
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2); | |
- // TODO: Because we read in the file a scanned block size at a time, | |
- // it is likely to be inefficient. Probably will be much better to | |
- // calculate checksums for all block sizes in a single pass. | |
- | |
- // Allocate the buffers. | |
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); | |
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); | |
+ // Allocate a bitmap buffer to optimize goodnessOfFit access | |
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8); | |
+ | |
try | |
{ | |
// Check buffer allocation | |
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) | |
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0) | |
{ | |
// If a buffer got allocated, it will be cleaned up in the catch block | |
throw std::bad_alloc(); | |
} | |
- // Flag to abort the run, if too many blocks are found -- avoid using | |
- // huge amounts of processor time when files contain many similar blocks. | |
- bool abortSearch = false; | |
+ // Shift file position back to beginning | |
+ int64_t bufferFileOffset = 0; | |
+ rFile.Seek(0, IOStream::SeekType_Absolute); | |
- // Search for each block size in turn | |
- // NOTE: Do the smallest size first, so that the scheme for adding | |
- // entries in the found list works as expected and replaces smallers block | |
- // with larger blocks when it finds matches at the same offset in the file. | |
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) | |
+ // We're going to be flipping back and forth between two buffers, the low and high | |
+ uint8_t *lowBuffer = pbuffer0; | |
+ int32_t lowBufferBytes = 0; | |
+ uint8_t *highBuffer = pbuffer1; | |
+ int32_t highBufferBytes = 0; | |
+ | |
+ // In some cases we need to carry over reads from prior buffers | |
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // Read the first buffer's worth of data | |
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize); | |
+ // Fill the second buffer if appropriate | |
+ if(lowBufferBytes == maxScanBlockSize) | |
{ | |
- ASSERT(Sizes[s] <= bufSize); | |
- BOX_TRACE("Diff pass " << s << ", for block size " << | |
- Sizes[s]); | |
- | |
- // Check we haven't finished | |
- if(Sizes[s] == 0) | |
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); | |
+ } | |
+ | |
+ // For every block size, initialize our scan tracking | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ ASSERT(scanSizes[z] <= maxScanBlockSize); | |
+ | |
+ // The sizes array may be mostly empty, in those cases we have no | |
+ // state to maintain. | |
+ if(scanSizes[z] != 0) | |
{ | |
- // empty entry, try next size | |
- continue; | |
+ // Mark for scan | |
+ scanThisSize[z] = true; | |
+ | |
+ // Set up the hash table for this size | |
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); | |
+ if(hashTables[z] == 0) | |
+ { | |
+ throw std::bad_alloc(); | |
+ } | |
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]); | |
+ | |
+ // Set up a rolling checksum, but only if there is enough data to start with | |
+ // (file may now be shorter than some block sizes previously used) | |
+ if(scanSizes[z] <= lowBufferBytes) | |
+ { | |
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]); | |
+ } | |
} | |
- | |
- // Set up the hash table entries | |
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); | |
+ } | |
- // Shift file position to beginning | |
- rFile.Seek(0, IOStream::SeekType_Absolute); | |
- | |
- // Read first block | |
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) | |
+ // Read loop while we can get full maxScanBlockSize reads | |
+ while(highBufferBytes == maxScanBlockSize) | |
+ { | |
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across | |
+ // the two buffers, which means we can walk every block size across all offsets | |
+ // in lowBuffer without any concern about overrunning the data available in highBuffer. | |
+ | |
+ // Check diffing timeout | |
+ if(maximumDiffingTime.HasExpired()) | |
{ | |
- // Size of file too short to match -- do next size | |
- continue; | |
+ ASSERT(pDiffTimer != NULL); | |
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff"); | |
+ abortSearch = true; | |
+ break; | |
} | |
- // Setup block pointers | |
- uint8_t *beginnings = pbuffer0; | |
- uint8_t *endings = pbuffer1; | |
- int offset = 0; | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
- // Calculate the first checksum, ready for rolling | |
- RollingChecksum rolling(beginnings, Sizes[s]); | |
+ // Don't you wish hash_map was standard? std::map is very slow | |
+ // when we access it as often as we do in the loop below. To work around | |
+ // we'll create a bitmap of the previous fits in this buffer | |
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); | |
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) | |
+ { | |
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break; | |
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); | |
+ } | |
- // Then roll, until the file is exhausted | |
- int64_t fileBlockNumber = 0; | |
- int64_t fileOffset = 0; | |
- int rollOverInitialBytes = 0; | |
- while(true) | |
+ // Walk all block sizes in block-probability order | |
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) | |
{ | |
- if(maximumDiffingTime.HasExpired()) | |
- { | |
- ASSERT(pDiffTimer != NULL); | |
- BOX_INFO("MaximumDiffingTime reached - " | |
- "suspending file diff"); | |
- abortSearch = true; | |
- break; | |
- } | |
+ // If there is no rolling checksum at this index, skip; it's either | |
+ // a zero size or the file was too small | |
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; | |
+ | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset); | |
+#endif | |
- if(pDiffTimer) | |
- { | |
- pDiffTimer->DoKeepAlive(); | |
- } | |
- | |
- // Load in another block of data, and record how big it is | |
- int bytesInEndings = rFile.Read(endings, Sizes[s]); | |
- int tmp; | |
+ // Offset of this block buffer | |
+ int32_t bufferOffset = 0; | |
- // Skip any bytes from a previous matched block | |
- if(rollOverInitialBytes > 0 && offset < bytesInEndings) | |
+ // Roll carry over | |
+ if(carryOverBytes[s] != 0) | |
{ | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; | |
+ ASSERT(carryOverBytes[s] > 0); | |
+ // Carry can be bigger than maxScanBlockSize because first matching | |
+ // phase might have used bigger blocks than the current max scan size | |
+ int32_t thisCarry = carryOverBytes[s]; | |
+ if(thisCarry > maxScanBlockSize) | |
+ { | |
+ thisCarry = maxScanBlockSize; | |
+ } | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
- | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- rollOverInitialBytes -= thisRoll; | |
- | |
- if(rollOverInitialBytes) | |
+ // Perform carry | |
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize) | |
{ | |
- goto refresh; | |
+ // Roll is split some low, some low/high | |
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s]; | |
+ ASSERT(lowRollBytes >= 0); | |
+ if(lowRollBytes > 0) | |
+ { | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes); | |
+ } | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes); | |
} | |
+ else | |
+ { | |
+ // Roll is all in low buffer | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry); | |
+ } | |
+ // Either way offset is carry | |
+ bufferOffset = thisCarry; | |
+ // Reduce carry by the carry amount actually carried | |
+ carryOverBytes[s] -= thisCarry; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
} | |
- | |
- if(goodnessOfFit.count(fileOffset)) | |
+ | |
+ // Loop remaining low buffer bytes, taking a checksum at each offset | |
+ while (bufferOffset < maxScanBlockSize) | |
{ | |
- tmp = goodnessOfFit[fileOffset]; | |
- } | |
- else | |
- { | |
- tmp = 0; | |
- } | |
+ // Look for larger size matches at this offset | |
+ int32_t bestFitSoFarSize = 0; | |
- if(tmp >= Sizes[s]) | |
- { | |
- // Skip over bigger ready-matched blocks completely | |
- rollOverInitialBytes = tmp; | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; | |
- | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
- | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- rollOverInitialBytes -= thisRoll; | |
- | |
- if(rollOverInitialBytes) | |
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) | |
{ | |
- goto refresh; | |
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; | |
} | |
- } | |
- while(offset < bytesInEndings) | |
- { | |
- // Is current checksum in hash list? | |
- uint16_t hash = rolling.GetComponentForHashing(); | |
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) | |
+ if(bestFitSoFarSize >= scanSizes[s]) | |
{ | |
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) | |
+ // Roll can be bigger than maxScanBlockSize because first matching | |
+ // phase might have used bigger blocks than the max scan size | |
+ int32_t thisRoll = bestFitSoFarSize; | |
+ int32_t remainderRoll = 0; | |
+ if(thisRoll > maxScanBlockSize) | |
{ | |
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); | |
- goodnessOfFit[fileOffset] = Sizes[s]; | |
+ thisRoll = maxScanBlockSize; | |
+ remainderRoll = bestFitSoFarSize - thisRoll; | |
+ } | |
- // Block matched, roll the checksum forward to the next block without doing | |
- // any more comparisons, because these are pointless (as any more matches will be ignored when | |
- // the recipe is generated) and just take up valuable processor time. Edge cases are | |
- // especially nasty, using huge amounts of time and memory. | |
- int skip = Sizes[s]; | |
- if(offset < bytesInEndings && skip > 0) | |
- { | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; | |
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will | |
+ // push the rolling checksum so that it is against the end of the buffer. | |
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
+ if(lowRoll > 0) | |
+ { | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); | |
+ bufferOffset += lowRoll; | |
+ thisRoll -= lowRoll; | |
+ } | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- skip -= thisRoll; | |
- } | |
- // Not all the bytes necessary will have been skipped, so get them | |
- // skipped after the next block is loaded. | |
- rollOverInitialBytes = skip; | |
- | |
- // End this loop, so the final byte isn't used again | |
- break; | |
- } | |
- else | |
+ if(thisRoll < 1) | |
{ | |
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); | |
+ ASSERT(remainderRoll == 0); | |
+ continue; // Back around to bufferOffset loop | |
} | |
- int64_t NumBlocksFound = static_cast<int64_t>( | |
- rFoundBlocks.size()); | |
- int64_t MaxBlocksFound = NumBlocks * | |
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
- | |
- if(NumBlocksFound > MaxBlocksFound) | |
+ // Roll high/low section. Our roll here will either exhaust the total or will | |
+ // leave the rolling checksum positioned a offset zero in the high buffer | |
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); | |
+ ASSERT(lowHighRoll > 0); | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); | |
+ bufferOffset += lowHighRoll; | |
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
+ continue; // Back around to bufferOffset loop | |
+ } | |
+ | |
+ // Look for block match at this size | |
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); | |
+ if(hashTables[s][hashValue] != 0) | |
+ { | |
+ int32_t lowSize = maxScanBlockSize - bufferOffset; | |
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s]; | |
+ int32_t highSize = scanSizes[s] - lowSize; | |
+ if(SecondStageMatch(hashTables[s][hashValue], | |
+ rollingSums[s], | |
+ lowBuffer + bufferOffset, lowSize, | |
+ highBuffer, highSize, | |
+ scanSizes[s], | |
+ bufferFileOffset + bufferOffset, | |
+ pIndex, rFoundBlocks)) | |
{ | |
- abortSearch = true; | |
- break; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); | |
+#endif | |
+ // Update best fit so far | |
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); | |
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; | |
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); | |
+ | |
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size | |
+ --scanSizesCount[s]; | |
+ if(scanSizesCount[s] < 1) | |
+ { | |
+ scanThisSize[s] = false; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Short-circuit size " << scanSizes[s]); | |
+#endif | |
+ } | |
+ | |
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize | |
+ // roll above for an explanation of the logic | |
+ int32_t thisRoll = scanSizes[s]; | |
+ int32_t remainderRoll = 0; | |
+ if(thisRoll > maxScanBlockSize) | |
+ { | |
+ thisRoll = maxScanBlockSize; | |
+ remainderRoll = bestFitSoFarSize - thisRoll; | |
+ } | |
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); | |
+ if(lowRoll > 0) | |
+ { | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); | |
+ bufferOffset += lowRoll; | |
+ thisRoll -= lowRoll; | |
+ } | |
+ if(thisRoll < 1) | |
+ { | |
+ ASSERT(remainderRoll == 0); | |
+ continue; // Back around to bufferOffset loop | |
+ } | |
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); | |
+ ASSERT(lowHighRoll > 0); | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); | |
+ bufferOffset += lowHighRoll; | |
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
+ continue; // Back around to bufferOffset loop | |
} | |
} | |
- // Roll checksum forward | |
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); | |
- | |
- // Increment offsets | |
- ++offset; | |
- ++fileOffset; | |
+ // Roll one byte, either low or low/high depending on current offset | |
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize) | |
+ { | |
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]); | |
+ } | |
+ else | |
+ { | |
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); | |
+ } | |
+ ++bufferOffset; | |
} | |
+ | |
+ // Check if we've found too much | |
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); | |
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
+ if(NumBlocksFound > MaxBlocksFound) | |
+ { | |
+ abortSearch = true; | |
+ break; | |
+ } | |
+ } | |
+ // If we're aborting, pass through | |
+ if(abortSearch) break; | |
+ | |
+ // Swap buffers so high is low and the go back around | |
+ uint8_t *swap = lowBuffer; | |
+ lowBuffer = highBuffer; | |
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize)); | |
+ bufferFileOffset += maxScanBlockSize; | |
+ highBuffer = swap; | |
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); | |
+ | |
+ } // End of maxScanBlockSize reads | |
+ | |
+ // What remains is either: | |
+ // - An abort or timeout | |
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer | |
+ // (Potentially zero low bytes for a zero-length file) | |
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer | |
+ // | |
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0)) | |
+ { | |
+ // We're going to repeat the logic in the previous read loop, but have | |
+ // to be careful not to walk off the end of the available data. To make this | |
+ // a lot less convoluted, we're just going to copy to consolidate buffers. | |
+ // This is cheating a bit, but worth the simplicity. | |
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true); | |
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true); | |
+ uint8_t *endingBuffer = lowBuffer; | |
+ if(highBufferBytes != 0) | |
+ { | |
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes); | |
+ } | |
+ int32_t endingBytes = lowBufferBytes + highBufferBytes; | |
+ | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
+ | |
+ // std::map still not winning | |
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); | |
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) | |
+ { | |
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); | |
+ } | |
+ | |
+ // Walk all block sizes in block-probability order | |
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) | |
+ { | |
+			// If there is no rolling checksum at this index, skip; it's either | |
+ // a zero size or the file was too small | |
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; | |
- if(abortSearch) break; | |
+ // If there's not enough data at the end, skip this block size | |
+ if(endingBytes < scanSizes[s]) continue; | |
+ | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset); | |
+#endif | |
- refresh: | |
- // Finished? | |
- if(bytesInEndings != Sizes[s]) | |
+ // Offset of this block buffer | |
+ int32_t bufferOffset = 0; | |
+ | |
+ // For each block size we can only walk so far safely | |
+ int32_t maxSafeOffset = endingBytes - scanSizes[s]; | |
+ | |
+ // Roll carry over | |
+ if(carryOverBytes[s] != 0) | |
{ | |
- // No more data in file -- check the final block | |
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for | |
- // each byte of the file) | |
- uint16_t hash = rolling.GetComponentForHashing(); | |
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) | |
+ ASSERT(carryOverBytes[s] > 0); | |
+ // Don't walk off the end | |
+ if(carryOverBytes[s] > maxSafeOffset) continue; | |
+ // Roll | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]); | |
+ bufferOffset = carryOverBytes[s]; | |
+ carryOverBytes[s] = 0; // Not really needed again, but be clean | |
+ } | |
+ | |
+ // Loop remaining end bytes, we must walk to exactly and then break the loop | |
+ while (bufferOffset <= maxSafeOffset) | |
+ { | |
+ // Look for larger size matches at this offset | |
+ int32_t bestFitSoFarSize = 0; | |
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) | |
{ | |
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) | |
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; | |
+ } | |
+ if(bestFitSoFarSize >= scanSizes[s]) { | |
+ // Space for this roll? | |
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop | |
+ // Roll | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize); | |
+ bufferOffset += bestFitSoFarSize; | |
+ continue; // Back around to byte read loop | |
+ } | |
+ | |
+ // Look for block match at this size | |
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); | |
+ if(hashTables[s][hashValue] != 0) | |
+ { | |
+ if(SecondStageMatch(hashTables[s][hashValue], | |
+ rollingSums[s], | |
+ endingBuffer + bufferOffset, | |
+ scanSizes[s], | |
+ 0, 0, scanSizes[s], | |
+ bufferFileOffset + bufferOffset, | |
+ pIndex, rFoundBlocks)) | |
{ | |
- goodnessOfFit[fileOffset] = Sizes[s]; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); | |
+#endif | |
+ // Update best fit so far | |
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); | |
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; | |
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); | |
+ | |
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size | |
+ --scanSizesCount[s]; | |
+ if(scanSizesCount[s] < 1) | |
+ { | |
+ scanThisSize[s] = false; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Short-circuit size " << scanSizes[s]); | |
+#endif | |
+ } | |
+ | |
+ // Roll over this matched block | |
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]); | |
+ bufferOffset += scanSizes[s]; | |
+ continue; // Back around to byte read loop | |
} | |
} | |
- | |
- // finish | |
- break; | |
+ | |
+ // Abort check | |
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); | |
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
+ if(NumBlocksFound > MaxBlocksFound) | |
+ { | |
+ abortSearch = true; | |
+ break; | |
+ } | |
+ | |
+ // Because byte loop conditional can equal maxSafeOffset we must not read off | |
+ // end here | |
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes | |
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); | |
+ ++bufferOffset; | |
} | |
- | |
- // Switch buffers, reset offset | |
- beginnings = endings; | |
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer | |
- offset = 0; | |
- | |
- // And count the blocks which have been done | |
- ++fileBlockNumber; | |
+ if(abortSearch) break; | |
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above | |
} | |
- | |
- if(abortSearch) break; | |
} | |
- // Free buffers and hash table | |
+ // Clean up | |
+ ::free(pbuffer0); | |
+ pbuffer0 = 0; | |
::free(pbuffer1); | |
pbuffer1 = 0; | |
- ::free(pbuffer0); | |
- pbuffer0 = 0; | |
- ::free(phashTable); | |
- phashTable = 0; | |
+ ::free(pfitBitmap); | |
+ pfitBitmap = 0; | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(hashTables[z] != 0) | |
+ { | |
+ free(hashTables[z]); | |
+ hashTables[z] = 0; | |
+ } | |
+ if(rollingSums[z] != 0) | |
+ { | |
+ delete rollingSums[z]; | |
+ rollingSums[z] = 0; | |
+ } | |
+ } | |
} | |
catch(...) | |
{ | |
- // Cleanup and throw | |
+ // Cleanup and rethrow | |
+ if(pbuffer0 != 0) ::free(pbuffer0); | |
if(pbuffer1 != 0) ::free(pbuffer1); | |
- if(pbuffer0 != 0) ::free(pbuffer0); | |
- if(phashTable != 0) ::free(phashTable); | |
+ if(pfitBitmap != 0) ::free(pfitBitmap); | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(hashTables[z] != 0) free(hashTables[z]); | |
+ } | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(rollingSums[z] != 0) delete rollingSums[z]; | |
+ } | |
throw; | |
} | |
- | |
+ | |
#ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
+dumpDiffList: | |
+ // Trace out the found blocks in debug mode | |
+ BOX_TRACE("Diff: list of found blocks"); | |
+ BOX_TRACE("======== ======== ======== ========"); | |
+ BOX_TRACE(" Offset BlkIdx Size Movement"); | |
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) | |
{ | |
- // Trace out the found blocks in debug mode | |
- BOX_TRACE("Diff: list of found blocks"); | |
- BOX_TRACE("======== ======== ======== ========"); | |
- BOX_TRACE(" Offset BlkIdx Size Movement"); | |
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) | |
+ int64_t orgLoc = 0; | |
+ for(int64_t b = 0; b < i->second; ++b) | |
{ | |
- int64_t orgLoc = 0; | |
- for(int64_t b = 0; b < i->second; ++b) | |
- { | |
- orgLoc += pIndex[b].mSize; | |
- } | |
- BOX_TRACE(std::setw(8) << i->first << " " << | |
- std::setw(8) << i->second << " " << | |
- std::setw(8) << pIndex[i->second].mSize << | |
- " " << | |
- std::setw(8) << (i->first - orgLoc)); | |
+ orgLoc += pIndex[b].mSize; | |
} | |
- BOX_TRACE("======== ======== ======== ========"); | |
+ BOX_TRACE(std::setw(8) << i->first << " " << | |
+ std::setw(8) << i->second << " " << | |
+ std::setw(8) << pIndex[i->second].mSize << | |
+ " " << | |
+ std::setw(8) << (i->first - orgLoc)); | |
} | |
+ BOX_TRACE("======== ======== ======== ========"); | |
#endif | |
} | |
@@ -781,6 +1158,8 @@ | |
{ | |
//BOX_TRACE("Another hash entry for " << hash << " found"); | |
// Yes -- need to set the pointer in this entry to the current entry to build the linked list | |
+ // This is safe because there is only one hash table per block size and block sizes | |
+ // are unique. | |
pIndex[b].mpNextInHashList = pHashTable[hash]; | |
} | |
@@ -790,30 +1169,35 @@ | |
} | |
} | |
- | |
// -------------------------------------------------------------------------- | |
// | |
// Function | |
// Name: static bool SecondStageMatch(xxx) | |
-// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. | |
+// Purpose: When a match in the hash table is found, scan for | |
+// second stage match using strong checksum. | |
// Created: 14/1/04 | |
// | |
// -------------------------------------------------------------------------- | |
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, | |
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) | |
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, | |
+ RollingChecksum *pFastSum, uint8_t *pLow, int32_t LowSize, | |
+ uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, | |
+ int64_t FileOffset, BlocksAvailableEntry *pIndex, | |
+ std::map<int64_t, int64_t> &rFoundBlocks) | |
{ | |
// Check parameters | |
- ASSERT(pBeginnings != 0); | |
- ASSERT(pEndings != 0); | |
- ASSERT(Offset >= 0); | |
+ ASSERT(pFirstInHashList != 0); | |
+ ASSERT(pFastSum != 0); | |
+ ASSERT(pLow != 0); | |
+ ASSERT(LowSize != 0); | |
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true); | |
ASSERT(BlockSize > 0); | |
- ASSERT(pFirstInHashList != 0); | |
+ ASSERT(BlockSize == (LowSize + HighSize)); | |
ASSERT(pIndex != 0); | |
#ifndef NDEBUG | |
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); | |
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing(); | |
#endif | |
- uint32_t Checksum = fastSum.GetChecksum(); | |
+ uint32_t Checksum = pFastSum->GetChecksum(); | |
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. | |
BlocksAvailableEntry *scan = pFirstInHashList; | |
@@ -834,18 +1218,13 @@ | |
// Calculate the strong MD5 digest for this block | |
MD5Digest strong; | |
- // Add the data from the beginnings | |
- strong.Add(pBeginnings + Offset, BlockSize - Offset); | |
- // Add any data from the endings | |
- if(Offset > 0) | |
- { | |
- strong.Add(pEndings, Offset); | |
- } | |
+ strong.Add(pLow, LowSize); | |
+ if(HighSize > 0) strong.Add(pHigh, HighSize); | |
strong.Finish(); | |
// Then go through the entries in the hash list, comparing with the strong digest calculated | |
scan = pFirstInHashList; | |
- //BOX_TRACE("second stage match"); | |
+ | |
while(scan != 0) | |
{ | |
//BOX_TRACE("scan size " << scan->mSize << | |
@@ -853,25 +1232,22 @@ | |
// ", hash " << Hash); | |
ASSERT(scan->mSize == BlockSize); | |
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); | |
- | |
+ | |
// Compare? | |
if(strong.DigestMatches(scan->mStrongChecksum)) | |
{ | |
- //BOX_TRACE("Match!\n"); | |
// Found! Add to list of found blocks... | |
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; | |
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory | |
// We do NOT search for smallest blocks first, as this code originally assumed. | |
// To prevent this from potentially overwriting a better match, the caller must determine | |
// the relative "goodness" of any existing match and this one, and avoid the call if it | |
// could be detrimental. | |
- rFoundBlocks[fileOffset] = blockIndex; | |
+ rFoundBlocks[FileOffset] = blockIndex; | |
// No point in searching further, report success | |
return true; | |
} | |
- | |
// Next | |
scan = scan->mpNextInHashList; | |
} | |
@@ -917,11 +1293,8 @@ | |
rRecipe.push_back(instruction); | |
#ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
- { | |
- BOX_TRACE("Diff: Default recipe generated, " << | |
- SizeOfInputFile << " bytes of file"); | |
- } | |
+ BOX_TRACE("Diff: Default recipe generated, " << | |
+ SizeOfInputFile << " bytes of file"); | |
#endif | |
// Don't do anything | |
@@ -1016,7 +1389,8 @@ | |
BOX_TRACE("Diff: " << | |
debug_NewBytesFound << " new bytes found, " << | |
debug_OldBlocksUsed << " old blocks used"); | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
+ | |
+ if (true) | |
{ | |
BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); | |
BOX_TRACE("======== ========= ========"); | |
Index: test/bbackupd/testbbackupd.cpp | |
=================================================================== | |
--- test/bbackupd/testbbackupd.cpp (revision 2361) | |
+++ test/bbackupd/testbbackupd.cpp (working copy) | |
@@ -1002,22 +1002,27 @@ | |
// before any matching blocks could be found. | |
intercept_setup_delay("testfiles/TestDir1/spacetest/f1", | |
0, 4000, SYS_read, 1); | |
- pid = start_internal_daemon(); | |
- intercept_clear_setup(); | |
+ { | |
+ Timers::Init(); | |
+ BackupDaemon bbackupd; | |
+ bbackupd.Configure("testfiles/bbackupd.conf"); | |
+ bbackupd.InitCrypto(); | |
- fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY); | |
- TEST_THAT(fd > 0); | |
- // write again, to update the file's timestamp | |
- TEST_EQUAL(sizeof(buffer), write(fd, buffer, sizeof(buffer)), | |
- "Buffer write"); | |
- TEST_THAT(close(fd) == 0); | |
+ fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY); | |
+ TEST_THAT(fd > 0); | |
+ // write again, to update the file's timestamp | |
+ TEST_EQUAL(1, write(fd, "z", 1), "Buffer write"); | |
+ TEST_THAT(close(fd) == 0); | |
- wait_for_backup_operation(); | |
- // can't test whether intercept was triggered, because | |
- // it's in a different process. | |
- // TEST_THAT(intercept_triggered()); | |
- TEST_THAT(stop_internal_daemon(pid)); | |
+ // wait long enough to put file into sync window | |
+ wait_for_operation(5); | |
+ bbackupd.RunSyncNow(); | |
+ TEST_THAT(intercept_triggered()); | |
+ intercept_clear_setup(); | |
+ Timers::Cleanup(); | |
+ } | |
+ | |
// check that the diff was aborted, i.e. upload was not a diff | |
found1 = false; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- lib/backupclient/BackupStoreFile.h 2008-03-03 19:34:49.000000000 -0800 | |
+++ lib/backupclient/BackupStoreFile.h 2008-02-27 20:17:20.000000000 -0800 | |
@@ -206,11 +206,6 @@ | |
static void ResetStats(); | |
static BackupStoreFileStats msStats; | |
- // For debug | |
-#ifndef NDEBUG | |
- static bool TraceDetailsOfDiffProcess; | |
-#endif | |
- | |
// For decoding encoded files | |
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile); | |
}; | |
--- test/backupdiff/testbackupdiff.cpp 2008-03-03 19:34:37.000000000 -0800 | |
+++ test/backupdiff/testbackupdiff.cpp 2008-02-27 20:22:53.000000000 -0800 | |
@@ -376,7 +376,7 @@ | |
// Want to trace out all the details | |
#ifndef NDEBUG | |
#ifndef WIN32 | |
- BackupStoreFile::TraceDetailsOfDiffProcess = true; | |
+ Logging::SetGlobalLevel(Log::TRACE); | |
#endif | |
#endif | |
--- lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 19:34:49.000000000 -0800 | |
+++ lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 20:09:08.000000000 -0800 | |
@@ -36,21 +36,12 @@ | |
using namespace BackupStoreFileCryptVar; | |
using namespace BackupStoreFileCreation; | |
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. | |
-// But it's useful for the test. | |
-#ifndef NDEBUG | |
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false; | |
-#endif | |
- | |
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); | |
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); | |
-static void SearchForMatchingBlocks(IOStream &rFile, | |
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, | |
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], | |
- DiffTimer *pDiffTimer); | |
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer); | |
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); | |
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, | |
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); | |
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum, | |
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset, | |
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); | |
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); | |
// -------------------------------------------------------------------------- | |
@@ -166,10 +157,6 @@ | |
try | |
{ | |
- // Find which sizes should be scanned | |
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); | |
- | |
// Flag for reporting to the user | |
bool completelyDifferent; | |
@@ -184,8 +171,7 @@ | |
// Get size of file | |
sizeOfInputFile = file.BytesLeftToRead(); | |
// Find all those lovely matching blocks | |
- SearchForMatchingBlocks(file, foundBlocks, pindex, | |
- blocksInIndex, sizesToScan, pDiffTimer); | |
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer); | |
// Is it completely different? | |
completelyDifferent = (foundBlocks.size() == 0); | |
@@ -361,390 +347,744 @@ | |
} | |
} | |
- | |
// -------------------------------------------------------------------------- | |
// | |
// Function | |
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
-// Purpose: Finds the most commonly used block sizes in the index | |
+// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, DiffTimer *) | |
+// Purpose: Find the matching blocks within the file. | |
// Created: 12/1/04 | |
// | |
// -------------------------------------------------------------------------- | |
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, | |
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer) | |
{ | |
- // Array for lengths | |
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ Timer maximumDiffingTime(0); | |
- // Set arrays to lots of zeros (= unused entries) | |
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) | |
+ if(pDiffTimer && pDiffTimer->IsManaged()) | |
{ | |
- Sizes[l] = 0; | |
- sizeCounts[l] = 0; | |
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime()); | |
} | |
- // Array for collecting sizes | |
- std::map<int32_t, int64_t> foundSizes; | |
+ // Flag to abort the run, if too many blocks are found or the diffing | |
+ // timeout expires | |
+ bool abortSearch = false; | |
+ | |
+ // Buffers used during both phases of search | |
+ uint8_t *pbuffer0 = 0; | |
+ uint8_t *pbuffer1 = 0; | |
+ | |
+ // Track offsets that already have block matches. We don't really care | |
+	// if it's sorted, and this actually produces a performance issue, so | |
+ // see pfitBitmap for a workaround | |
+ std::map<int64_t, int32_t> goodnessOfFit; | |
- // Run through blocks and make a count of the entries | |
- for(int64_t b = 0; b < NumBlocks; ++b) | |
+ // Collect sizes that aren't found in the file at their old offset | |
+ std::map<int32_t, int64_t> unmatchedSizes; | |
+ | |
+ // Our arrays of block sizes during second search pass | |
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ int32_t maxScanBlockSize = 0; | |
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // We need to keep separate rolling checksums for each block size in second search | |
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+	// And a hash lookup table per block size in second search | |
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // We allow second search pass to short-circuit off rare blocks | |
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // During block read we have a bitmap of prefit locations to avoid std::map | |
+ // performance problems | |
+ uint8_t *pfitBitmap = 0; | |
+ | |
+ | |
+ // First search pass... | |
+ // | |
+ // For many files (especially large ones) most of the file is unchanged. | |
+ // The RollingChecksum process requires us to read every byte of the file looking for | |
+ // blocks that have moved. However, we can make that process more efficient by | |
+ // quickly rolling over areas that match a different block. We can also use | |
+ // this to eliminate entirely the rolling checksum for block sizes that only exist | |
+ // at one location in the file. | |
+ // | |
+ // Thus we start by looking for blocks that have not moved. Only if a block | |
+ // cannot be found at its previous location do we consider scanning for it by | |
+ // rolling checksum size. | |
+ // | |
+ // This strategy has some disadvantages for files with lots of repeating content | |
+ // that happens to align with our block size, but the reduction in diff time | |
+ // for more typical files is worth it. | |
+ // | |
+ // Note also that in this pass we consider _all_ block sizes (smaller than | |
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare | |
+ // is cheap for us to find in this pass. | |
+ // | |
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE); | |
+ try | |
{ | |
- // Only if the block size is bigger than the minimum size we'll scan for | |
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) | |
+ if(pbuffer0 == 0) | |
+ { | |
+ throw std::bad_alloc(); | |
+ } | |
+ | |
+ // We have to track file offset since the read may fail | |
+ int64_t fileOffset = 0; | |
+ | |
+ // Walk the blocks | |
+ for(int64_t b = 0; b < NumBlocks; ++b) | |
{ | |
- // Find entry? | |
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); | |
- if(f != foundSizes.end()) | |
+ // Check diffing timeout | |
+ if(maximumDiffingTime.HasExpired()) | |
+ { | |
+ ASSERT(pDiffTimer != NULL); | |
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff"); | |
+ abortSearch = true; | |
+ break; | |
+ } | |
+ | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
+ | |
+ // Skip blocks too large for our buffer | |
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) { | |
+ fileOffset += pIndex[b].mSize; | |
+ continue; | |
+ } | |
+ | |
+ // Have to guard the seek operation, it could throw. We don't know size of the | |
+ // current file, and checking is pointless anyway since there's a race. | |
+ // In reality on Unix this is implemented with lseek which will mean we can | |
+ // seek past the EOF, but I don't want to make assumptions about Win32. | |
+ int32_t readSize = 0; | |
+ try | |
+ { | |
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute); | |
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize); | |
+ } | |
+ catch(BoxException &e) | |
+ { | |
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError) | |
+ { | |
+ // Not what we expected, rethrow | |
+ throw; | |
+ } | |
+ } | |
+ | |
+ // Check for a match | |
+ bool blockMatched = false; | |
+ if(readSize == pIndex[b].mSize) | |
{ | |
- // Increment existing entry | |
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; | |
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you | |
+ // worry this is expensive just remember that prior versions of this code | |
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated | |
+ // rolling checksums every time. | |
+ MD5Digest strong; | |
+ strong.Add(pbuffer0, pIndex[b].mSize); | |
+ strong.Finish(); | |
+ | |
+ // Do we have a match? | |
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum)) | |
+ { | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset); | |
+#endif | |
+ rFoundBlocks[fileOffset] = b; | |
+ goodnessOfFit[fileOffset] = pIndex[b].mSize; | |
+ blockMatched = true; | |
+ } | |
} | |
- else | |
+ | |
+ // We're done with the file offset, so increment now | |
+ fileOffset += pIndex[b].mSize; | |
+ | |
+ // If the block didn't match then this is a size we'll have to scan for | |
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)) | |
{ | |
- // New entry | |
- foundSizes[pIndex[b].mSize] = 1; | |
+ // Find entry? | |
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize)); | |
+ if(f != unmatchedSizes.end()) | |
+ { | |
+ // Increment existing entry | |
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1; | |
+ } | |
+ else | |
+ { | |
+ // New entry | |
+ unmatchedSizes[pIndex[b].mSize] = 1; | |
+ } | |
} | |
} | |
+ | |
+ // Cleanup | |
+ ::free(pbuffer0); | |
+ pbuffer0 = 0; | |
} | |
- | |
- // Make the block sizes | |
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) | |
+ catch(...) | |
+ { | |
+ // Cleanup and rethrow | |
+ if(pbuffer0 != 0) ::free(pbuffer0); | |
+ throw; | |
+ } | |
+ | |
+ // Are we already out of time? | |
+ if(abortSearch) | |
{ | |
- // Find the position of the size in the array | |
+#ifndef NDEBUG | |
+ goto dumpDiffList; | |
+#endif | |
+ return; | |
+ } | |
+ | |
+ | |
+ // Second search pass... | |
+ // | |
+ // In our second phase, having matched all unchanged blocks we now need | |
+ // to scan for moved blocks. This involves looping across all unmatched | |
+ // block sizes and using the rolling checksum to look for relocations. To keep | |
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES | |
+ // for the number of sizes to scan. We also scan for the blocks in order of | |
+ // their relative probability, a block size that occurs frequently is scanned | |
+ // first. | |
+ // | |
+ | |
+ // Loop all sizes inserting higher usages into the array | |
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i) | |
+ { | |
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases | |
+ // it would be less expensive to upload a few thousand bytes rather than do the scan. | |
+ // Here would be a good place to filter block sizes that aren't worth the effort | |
+ // once a suitable set of heuristics is found. | |
+ | |
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
{ | |
- // Instead of sorting on the raw count of blocks, | |
- // take the file area covered by this block size. | |
- if(i->second * i->first > sizeCounts[t] * Sizes[t]) | |
+ // Instead of sorting on the raw count of blocks, take the file area covered by this | |
+ // block size. This helps avoid favoring low numbers of large blocks over many | |
+ // small blocks. | |
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t])) | |
{ | |
// Then this size belong before this entry -- shuffle them up | |
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) | |
{ | |
- Sizes[s] = Sizes[s-1]; | |
- sizeCounts[s] = sizeCounts[s-1]; | |
+ scanSizes[s] = scanSizes[s-1]; | |
+ scanSizesCount[s] = scanSizesCount[s-1]; | |
} | |
// Insert this size | |
- Sizes[t] = i->first; | |
- sizeCounts[t] = i->second; | |
+ scanSizes[t] = i->first; | |
+ scanSizesCount[t] = i->second; | |
- // Shouldn't do any more searching | |
+ // Update max size | |
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first; | |
+ | |
+ // Shouldn't do any more searching for this size | |
break; | |
} | |
} | |
} | |
- | |
- // trace the size table in debug builds | |
+ // Dump the size table in debug builds | |
#ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
{ | |
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) | |
- { | |
- TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]); | |
- } | |
+ if(scanSizes[t] != 0) BOX_TRACE("Scan block size " << t << ": " << scanSizes[t] << " count: " << scanSizesCount[t]); | |
} | |
#endif | |
-} | |
- | |
- | |
- | |
-// -------------------------------------------------------------------------- | |
-// | |
-// Function | |
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) | |
-// Purpose: Find the matching blocks within the file. | |
-// Created: 12/1/04 | |
-// | |
-// -------------------------------------------------------------------------- | |
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, | |
- BlocksAvailableEntry *pIndex, int64_t NumBlocks, | |
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) | |
-{ | |
- Timer maximumDiffingTime(0); | |
- if(pDiffTimer && pDiffTimer->IsManaged()) | |
+ // If we didn't find any sizes (could happen if all were found in the first matching | |
+ // phase) we're done. | |
+ if(maxScanBlockSize == 0) | |
{ | |
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime()); | |
+#ifndef NDEBUG | |
+ BOX_TRACE("No scan block sizes, skip rolling checksum"); | |
+ goto dumpDiffList; | |
+#endif | |
+ return; | |
} | |
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE); | |
- std::map<int64_t, int32_t> goodnessOfFit; | |
- | |
- // Allocate the hash lookup table | |
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); | |
- | |
- // Choose a size for the buffer, just a little bit more than the maximum block size | |
- int32_t bufSize = Sizes[0]; | |
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
- { | |
- if(Sizes[z] > bufSize) bufSize = Sizes[z]; | |
- } | |
- bufSize += 4; | |
- ASSERT(bufSize > Sizes[0]); | |
- ASSERT(bufSize > 0); | |
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) | |
- { | |
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) | |
- } | |
+ // Allocate two buffers we'll toggle between at the max scan block size | |
+ // Their sizes are doubled to make the final block at the end easier (we're cheating) | |
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2); | |
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2); | |
+ | |
+ // Allocate a bitmap buffer to optimize goodnessOfFit access | |
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8); | |
- // TODO: Because we read in the file a scanned block size at a time, | |
- // it is likely to be inefficient. Probably will be much better to | |
- // calculate checksums for all block sizes in a single pass. | |
- | |
- // Allocate the buffers. | |
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); | |
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); | |
try | |
{ | |
// Check buffer allocation | |
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) | |
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0) | |
{ | |
// If a buffer got allocated, it will be cleaned up in the catch block | |
throw std::bad_alloc(); | |
} | |
- // Flag to abort the run, if too many blocks are found -- avoid using | |
- // huge amounts of processor time when files contain many similar blocks. | |
- bool abortSearch = false; | |
- | |
- // Search for each block size in turn | |
- // NOTE: Do the smallest size first, so that the scheme for adding | |
- // entries in the found list works as expected and replaces smallers block | |
- // with larger blocks when it finds matches at the same offset in the file. | |
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) | |
- { | |
- ASSERT(Sizes[s] <= bufSize); | |
- BOX_TRACE("Diff pass " << s << ", for block size " << | |
- Sizes[s]); | |
- | |
- // Check we haven't finished | |
- if(Sizes[s] == 0) | |
- { | |
- // empty entry, try next size | |
- continue; | |
- } | |
- | |
- // Set up the hash table entries | |
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); | |
+ // Shift file position back to beginning | |
+ int64_t bufferFileOffset = 0; | |
+ rFile.Seek(0, IOStream::SeekType_Absolute); | |
- // Shift file position to beginning | |
- rFile.Seek(0, IOStream::SeekType_Absolute); | |
- | |
- // Read first block | |
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) | |
+ // We're going to be flipping back and forth between two buffers, the low and high | |
+ uint8_t *lowBuffer = pbuffer0; | |
+ int32_t lowBufferBytes = 0; | |
+ uint8_t *highBuffer = pbuffer1; | |
+ int32_t highBufferBytes = 0; | |
+ | |
+ // In some cases we need to carry over reads from prior buffers | |
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; | |
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES)); | |
+ | |
+ // Read the first buffer's worth of data | |
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize); | |
+ // Fill the second buffer if appropriate | |
+ if(lowBufferBytes == maxScanBlockSize) | |
+ { | |
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); | |
+ } | |
+ | |
+ // For every block size, initialize our scan tracking | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ ASSERT(scanSizes[z] <= maxScanBlockSize); | |
+ | |
+ // The sizes array may be mostly empty, in those cases we have no | |
+ // state to maintain. | |
+ if(scanSizes[z] != 0) | |
{ | |
- // Size of file too short to match -- do next size | |
- continue; | |
- } | |
- | |
- // Setup block pointers | |
- uint8_t *beginnings = pbuffer0; | |
- uint8_t *endings = pbuffer1; | |
- int offset = 0; | |
- | |
- // Calculate the first checksum, ready for rolling | |
- RollingChecksum rolling(beginnings, Sizes[s]); | |
- | |
- // Then roll, until the file is exhausted | |
- int64_t fileBlockNumber = 0; | |
- int64_t fileOffset = 0; | |
- int rollOverInitialBytes = 0; | |
- while(true) | |
- { | |
- if(maximumDiffingTime.HasExpired()) | |
- { | |
- ASSERT(pDiffTimer != NULL); | |
- BOX_INFO("MaximumDiffingTime reached - " | |
- "suspending file diff"); | |
- abortSearch = true; | |
- break; | |
- } | |
+ // Mark for scan | |
+ scanThisSize[z] = true; | |
- if(pDiffTimer) | |
+ // Set up the hash table for this size | |
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); | |
+ if(hashTables[z] == 0) | |
{ | |
- pDiffTimer->DoKeepAlive(); | |
+ throw std::bad_alloc(); | |
} | |
- | |
- // Load in another block of data, and record how big it is | |
- int bytesInEndings = rFile.Read(endings, Sizes[s]); | |
- int tmp; | |
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]); | |
- // Skip any bytes from a previous matched block | |
- if(rollOverInitialBytes > 0 && offset < bytesInEndings) | |
+ // Set up a rolling checksum, but only if there is enough data to start with | |
+ // (file may now be shorter than some block sizes previously used) | |
+ if(scanSizes[z] <= lowBufferBytes) | |
{ | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; | |
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]); | |
+ } | |
+ } | |
+ } | |
+ | |
+ // Read loop while we can get full maxScanBlockSize reads | |
+ while(highBufferBytes == maxScanBlockSize) | |
+ { | |
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across | |
+ // the two buffers, which means we can walk every block size across all offsets | |
+ // in lowBuffer without any concern about overrunning the data available in highBuffer. | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
+ // Check diffing timeout | |
+ if(maximumDiffingTime.HasExpired()) | |
+ { | |
+ ASSERT(pDiffTimer != NULL); | |
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff"); | |
+ abortSearch = true; | |
+ break; | |
+ } | |
+ | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
+ | |
+ // Don't you wish hash_map was standard? std::map is very slow | |
+ // when we access it as often as we do in the loop below. To work around | |
+ // we'll create a bitmap of the previous fits in this buffer | |
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); | |
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) | |
+ { | |
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break; | |
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); | |
+ } | |
+ | |
+ // Walk all block sizes in block-probability order | |
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) | |
+ { | |
+ // If there is no rolling checksum at this index, skip; it's either | |
+ // a zero size or the file was too small | |
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- rollOverInitialBytes -= thisRoll; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset); | |
+#endif | |
+ | |
+ // Offset of this block buffer | |
+ int32_t bufferOffset = 0; | |
- if(rollOverInitialBytes) | |
+ // Roll carry over | |
+ if(carryOverBytes[s] != 0) | |
+ { | |
+ ASSERT(carryOverBytes[s] > 0); | |
+ // Carry can be bigger than maxScanBlockSize because first matching | |
+ // phase might have used bigger blocks than the current max scan size | |
+ int32_t thisCarry = carryOverBytes[s]; | |
+ if(thisCarry > maxScanBlockSize) | |
{ | |
- goto refresh; | |
+ thisCarry = maxScanBlockSize; | |
} | |
- } | |
- | |
- if(goodnessOfFit.count(fileOffset)) | |
- { | |
- tmp = goodnessOfFit[fileOffset]; | |
- } | |
- else | |
- { | |
- tmp = 0; | |
- } | |
- if(tmp >= Sizes[s]) | |
- { | |
- // Skip over bigger ready-matched blocks completely | |
- rollOverInitialBytes = tmp; | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; | |
- | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
- | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- rollOverInitialBytes -= thisRoll; | |
- | |
- if(rollOverInitialBytes) | |
+ // Perform carry | |
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize) | |
+ { | |
+ // Roll is split some low, some low/high | |
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s]; | |
+ ASSERT(lowRollBytes >= 0); | |
+ if(lowRollBytes > 0) | |
+ { | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes); | |
+ } | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes); | |
+ } | |
+ else | |
{ | |
- goto refresh; | |
+ // Roll is all in low buffer | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry); | |
} | |
+ // Either way offset is carry | |
+ bufferOffset = thisCarry; | |
+ // Reduce carry by the carry amount actually carried | |
+ carryOverBytes[s] -= thisCarry; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
} | |
- | |
- while(offset < bytesInEndings) | |
+ | |
+ // Loop remaining low buffer bytes, taking a checksum at each offset | |
+ while (bufferOffset < maxScanBlockSize) | |
{ | |
- // Is current checksum in hash list? | |
- uint16_t hash = rolling.GetComponentForHashing(); | |
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) | |
+ // Look for larger size matches at this offset | |
+ int32_t bestFitSoFarSize = 0; | |
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) | |
{ | |
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) | |
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; | |
+ } | |
+ if(bestFitSoFarSize >= scanSizes[s]) { | |
+ // Roll can be bigger than maxScanBlockSize because first matching | |
+ // phase might have used bigger blocks than the max scan size | |
+ int32_t thisRoll = bestFitSoFarSize; | |
+ int32_t remainderRoll = 0; | |
+ if(thisRoll > maxScanBlockSize) | |
{ | |
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); | |
- goodnessOfFit[fileOffset] = Sizes[s]; | |
- | |
- // Block matched, roll the checksum forward to the next block without doing | |
- // any more comparisons, because these are pointless (as any more matches will be ignored when | |
- // the receipe is generated) and just take up valuable processor time. Edge cases are | |
- // especially nasty, using huge amounts of time and memory. | |
- int skip = Sizes[s]; | |
- if(offset < bytesInEndings && skip > 0) | |
- { | |
- int spaceLeft = bytesInEndings - offset; | |
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; | |
- | |
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); | |
- | |
- offset += thisRoll; | |
- fileOffset += thisRoll; | |
- skip -= thisRoll; | |
- } | |
- // Not all the bytes necessary will have been skipped, so get them | |
- // skipped after the next block is loaded. | |
- rollOverInitialBytes = skip; | |
- | |
- // End this loop, so the final byte isn't used again | |
- break; | |
+ thisRoll = maxScanBlockSize; | |
+ remainderRoll = bestFitSoFarSize - thisRoll; | |
} | |
- else | |
+ | |
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will | |
+ // push the rolling checksum so that it is against the end of the buffer. | |
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); | |
+ if(lowRoll > 0) | |
{ | |
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); | |
+ bufferOffset += lowRoll; | |
+ thisRoll -= lowRoll; | |
} | |
- | |
- int64_t NumBlocksFound = static_cast<int64_t>( | |
- rFoundBlocks.size()); | |
- int64_t MaxBlocksFound = NumBlocks * | |
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
- | |
- if(NumBlocksFound > MaxBlocksFound) | |
+ if(thisRoll < 1) | |
{ | |
- abortSearch = true; | |
- break; | |
+ ASSERT(remainderRoll == 0); | |
+ continue; // Back around to bufferOffset loop | |
} | |
+ // Roll high/low section. Our roll here will either exhaust the total or will | |
+ // leave the rolling checksum positioned at offset zero in the high buffer | |
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); | |
+ ASSERT(lowHighRoll > 0); | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); | |
+ bufferOffset += lowHighRoll; | |
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
+ continue; // Back around to bufferOffset loop | |
} | |
- // Roll checksum forward | |
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); | |
- | |
- // Increment offsets | |
- ++offset; | |
- ++fileOffset; | |
- } | |
- | |
- if(abortSearch) break; | |
- | |
- refresh: | |
- // Finished? | |
- if(bytesInEndings != Sizes[s]) | |
- { | |
- // No more data in file -- check the final block | |
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for | |
- // each byte of the file) | |
- uint16_t hash = rolling.GetComponentForHashing(); | |
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) | |
+ // Look for block match at this size | |
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); | |
+ if(hashTables[s][hashValue] != 0) | |
{ | |
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) | |
+ int32_t lowSize = maxScanBlockSize - bufferOffset; | |
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s]; | |
+ int32_t highSize = scanSizes[s] - lowSize; | |
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], lowBuffer + bufferOffset, lowSize, | |
+ highBuffer, highSize, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks)) | |
{ | |
- goodnessOfFit[fileOffset] = Sizes[s]; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); | |
+#endif | |
+ // Update best fit so far | |
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); | |
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; | |
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); | |
+ | |
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size | |
+ --scanSizesCount[s]; | |
+ if(scanSizesCount[s] < 1) | |
+ { | |
+ scanThisSize[s] = false; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Short-circuit size " << scanSizes[s]); | |
+#endif | |
+ } | |
+ | |
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize | |
+ // roll above for an explanation of the logic | |
+ int32_t thisRoll = scanSizes[s]; | |
+ int32_t remainderRoll = 0; | |
+ if(thisRoll > maxScanBlockSize) | |
+ { | |
+ thisRoll = maxScanBlockSize; | |
+ remainderRoll = bestFitSoFarSize - thisRoll; | |
+ } | |
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]); | |
+ if(lowRoll > 0) | |
+ { | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll); | |
+ bufferOffset += lowRoll; | |
+ thisRoll -= lowRoll; | |
+ } | |
+ if(thisRoll < 1) | |
+ { | |
+ ASSERT(remainderRoll == 0); | |
+ continue; // Back around to bufferOffset loop | |
+ } | |
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset); | |
+ ASSERT(lowHighRoll > 0); | |
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll); | |
+ bufferOffset += lowHighRoll; | |
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll; | |
+ ASSERT(carryOverBytes[s] >= 0); | |
+ continue; // Back around to bufferOffset loop | |
} | |
} | |
+ | |
+ // Roll one byte, either low or low/high depending on current offset | |
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize) | |
+ { | |
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]); | |
+ } | |
+ else | |
+ { | |
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); | |
+ } | |
+ ++bufferOffset; | |
+ } | |
- // finish | |
+ // Check if we've found too much | |
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); | |
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
+ if(NumBlocksFound > MaxBlocksFound) | |
+ { | |
+ abortSearch = true; | |
break; | |
} | |
- | |
- // Switch buffers, reset offset | |
- beginnings = endings; | |
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer | |
- offset = 0; | |
+ } | |
+ // If we're aborting, pass through | |
+ if(abortSearch) break; | |
+ | |
+ // Swap buffers so high is low and the go back around | |
+ uint8_t *swap = lowBuffer; | |
+ lowBuffer = highBuffer; | |
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize)); | |
+ bufferFileOffset += maxScanBlockSize; | |
+ highBuffer = swap; | |
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize); | |
- // And count the blocks which have been done | |
- ++fileBlockNumber; | |
+ } // End of maxScanBlockSize reads | |
+ | |
+ // What remains is either: | |
+ // - An abort or timeout | |
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer | |
+ // (Potentially zero low bytes for a zero-length file) | |
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer | |
+ // | |
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0)) | |
+ { | |
+ // We're going to repeat the logic in the previous read loop, but have | |
+ // to be careful not to walk off the end of the available data. To make this | |
+ // a lot less convoluted, we're just going to copy to consolidate buffers. | |
+ // This is cheating a bit, but worth the simplicity. | |
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true); | |
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true); | |
+ uint8_t *endingBuffer = lowBuffer; | |
+ if(highBufferBytes != 0) | |
+ { | |
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes); | |
+ } | |
+ int32_t endingBytes = lowBufferBytes + highBufferBytes; | |
+ | |
+ // Send keep alive | |
+ if(pDiffTimer) pDiffTimer->DoKeepAlive(); | |
+ | |
+ // std::map still not winning | |
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8); | |
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i) | |
+ { | |
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7)); | |
} | |
+ | |
+ // Walk all block sizes in block-probability order | |
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s) | |
+ { | |
+ // If there is no rolling checksum at this index, skip; it's either | |
+ // a zero size or the file was too small | |
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue; | |
+ | |
+ // If there's not enough data at the end, skip this block size | |
+ if(endingBytes < scanSizes[s]) continue; | |
- if(abortSearch) break; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset); | |
+#endif | |
+ | |
+ // Offset of this block buffer | |
+ int32_t bufferOffset = 0; | |
+ | |
+ // For each block size we can only walk so far safely | |
+ int32_t maxSafeOffset = endingBytes - scanSizes[s]; | |
+ | |
+ // Roll carry over | |
+ if(carryOverBytes[s] != 0) | |
+ { | |
+ ASSERT(carryOverBytes[s] > 0); | |
+ // Don't walk off the end | |
+ if(carryOverBytes[s] > maxSafeOffset) continue; | |
+ // Roll | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]); | |
+ bufferOffset = carryOverBytes[s]; | |
+ carryOverBytes[s] = 0; // Not really needed again, but be clean | |
+ } | |
+ | |
+ // Loop remaining end bytes; we must walk to exactly maxSafeOffset and then break the loop | |
+ while (bufferOffset <= maxSafeOffset) | |
+ { | |
+ // Look for larger size matches at this offset | |
+ int32_t bestFitSoFarSize = 0; | |
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7))) | |
+ { | |
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset]; | |
+ } | |
+ if(bestFitSoFarSize >= scanSizes[s]) { | |
+ // Space for this roll? | |
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop | |
+ // Roll | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize); | |
+ bufferOffset += bestFitSoFarSize; | |
+ continue; // Back around to byte read loop | |
+ } | |
+ | |
+ // Look for block match at this size | |
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing(); | |
+ if(hashTables[s][hashValue] != 0) | |
+ { | |
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], endingBuffer + bufferOffset, scanSizes[s], | |
+ 0, 0, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks)) | |
+ { | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset); | |
+#endif | |
+ // Update best fit so far | |
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s])); | |
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s]; | |
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7)); | |
+ | |
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size | |
+ --scanSizesCount[s]; | |
+ if(scanSizesCount[s] < 1) | |
+ { | |
+ scanThisSize[s] = false; | |
+#ifndef NDEBUG | |
+ BOX_TRACE("Short-circuit size " << scanSizes[s]); | |
+#endif | |
+ } | |
+ | |
+ // Roll over this matched block | |
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop | |
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]); | |
+ bufferOffset += scanSizes[s]; | |
+ continue; // Back around to byte read loop | |
+ } | |
+ } | |
+ | |
+ // Abort check | |
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size()); | |
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; | |
+ if(NumBlocksFound > MaxBlocksFound) | |
+ { | |
+ abortSearch = true; | |
+ break; | |
+ } | |
+ | |
+ // Because byte loop conditional can equal maxSafeOffset we must not read off | |
+ // end here | |
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes | |
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]); | |
+ ++bufferOffset; | |
+ } | |
+ if(abortSearch) break; | |
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above | |
+ } | |
} | |
- // Free buffers and hash table | |
- ::free(pbuffer1); | |
- pbuffer1 = 0; | |
+ // Clean up | |
::free(pbuffer0); | |
pbuffer0 = 0; | |
- ::free(phashTable); | |
- phashTable = 0; | |
+ ::free(pbuffer1); | |
+ pbuffer1 = 0; | |
+ ::free(pfitBitmap); | |
+ pfitBitmap = 0; | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(hashTables[z] != 0) | |
+ { | |
+ free(hashTables[z]); | |
+ hashTables[z] = 0; | |
+ } | |
+ if(rollingSums[z] != 0) | |
+ { | |
+ delete rollingSums[z]; | |
+ rollingSums[z] = 0; | |
+ } | |
+ } | |
} | |
catch(...) | |
{ | |
- // Cleanup and throw | |
- if(pbuffer1 != 0) ::free(pbuffer1); | |
+ // Cleanup and rethrow | |
if(pbuffer0 != 0) ::free(pbuffer0); | |
- if(phashTable != 0) ::free(phashTable); | |
+ if(pbuffer1 != 0) ::free(pbuffer1); | |
+ if(pfitBitmap != 0) ::free(pfitBitmap); | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(hashTables[z] != 0) free(hashTables[z]); | |
+ } | |
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) | |
+ { | |
+ if(rollingSums[z] != 0) delete rollingSums[z]; | |
+ } | |
throw; | |
} | |
- | |
+ | |
#ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
- { | |
- // Trace out the found blocks in debug mode | |
- BOX_TRACE("Diff: list of found blocks"); | |
- BOX_TRACE("======== ======== ======== ========"); | |
- BOX_TRACE(" Offset BlkIdx Size Movement"); | |
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) | |
- { | |
- int64_t orgLoc = 0; | |
- for(int64_t b = 0; b < i->second; ++b) | |
- { | |
- orgLoc += pIndex[b].mSize; | |
- } | |
- BOX_TRACE(std::setw(8) << i->first << " " << | |
- std::setw(8) << i->second << " " << | |
- std::setw(8) << pIndex[i->second].mSize << | |
- " " << | |
- std::setw(8) << (i->first - orgLoc)); | |
- } | |
- BOX_TRACE("======== ======== ======== ========"); | |
+dumpDiffList: | |
+ // Trace out the found blocks in debug mode | |
+ BOX_TRACE("Diff: list of found blocks"); | |
+ BOX_TRACE("======== ======== ======== ========"); | |
+ BOX_TRACE(" Offset BlkIdx Size Movement"); | |
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) | |
+ { | |
+ int64_t orgLoc = 0; | |
+ for(int64_t b = 0; b < i->second; ++b) | |
+ { | |
+ orgLoc += pIndex[b].mSize; | |
+ } | |
+ BOX_TRACE(std::setw(8) << i->first << " " << | |
+ std::setw(8) << i->second << " " << | |
+ std::setw(8) << pIndex[i->second].mSize << | |
+ " " << | |
+ std::setw(8) << (i->first - orgLoc)); | |
} | |
+ BOX_TRACE("======== ======== ======== ========"); | |
#endif | |
} | |
@@ -776,6 +1116,8 @@ | |
{ | |
//TRACE1("Another hash entry for %d found\n", hash); | |
// Yes -- need to set the pointer in this entry to the current entry to build the linked list | |
+ // This is safe because there is only one hash table per block size and block sizes | |
+ // are unique. | |
pIndex[b].mpNextInHashList = pHashTable[hash]; | |
} | |
@@ -785,7 +1127,6 @@ | |
} | |
} | |
- | |
// -------------------------------------------------------------------------- | |
// | |
// Function | |
@@ -794,21 +1135,24 @@ | |
// Created: 14/1/04 | |
// | |
// -------------------------------------------------------------------------- | |
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, | |
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) | |
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum, | |
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset, | |
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) | |
{ | |
// Check parameters | |
- ASSERT(pBeginnings != 0); | |
- ASSERT(pEndings != 0); | |
- ASSERT(Offset >= 0); | |
- ASSERT(BlockSize > 0); | |
ASSERT(pFirstInHashList != 0); | |
+ ASSERT(pFastSum != 0); | |
+ ASSERT(pLow != 0); | |
+ ASSERT(LowSize != 0); | |
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true); | |
+ ASSERT(BlockSize > 0); | |
+ ASSERT(BlockSize == (LowSize + HighSize)); | |
ASSERT(pIndex != 0); | |
#ifndef NDEBUG | |
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); | |
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing(); | |
#endif | |
- uint32_t Checksum = fastSum.GetChecksum(); | |
+ uint32_t Checksum = pFastSum->GetChecksum(); | |
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. | |
BlocksAvailableEntry *scan = pFirstInHashList; | |
@@ -829,42 +1173,33 @@ | |
// Calculate the strong MD5 digest for this block | |
MD5Digest strong; | |
- // Add the data from the beginnings | |
- strong.Add(pBeginnings + Offset, BlockSize - Offset); | |
- // Add any data from the endings | |
- if(Offset > 0) | |
- { | |
- strong.Add(pEndings, Offset); | |
- } | |
+ strong.Add(pLow, LowSize); | |
+ if(HighSize > 0) strong.Add(pHigh, HighSize); | |
strong.Finish(); | |
// Then go through the entries in the hash list, comparing with the strong digest calculated | |
scan = pFirstInHashList; | |
- //TRACE0("second stage match\n"); | |
while(scan != 0) | |
{ | |
//TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash); | |
ASSERT(scan->mSize == BlockSize); | |
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); | |
- | |
+ | |
// Compare? | |
if(strong.DigestMatches(scan->mStrongChecksum)) | |
{ | |
- //TRACE0("Match!\n"); | |
// Found! Add to list of found blocks... | |
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; | |
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory | |
// We do NOT search for smallest blocks first, as this code originally assumed. | |
// To prevent this from potentially overwriting a better match, the caller must determine | |
// the relative "goodness" of any existing match and this one, and avoid the call if it | |
// could be detrimental. | |
- rFoundBlocks[fileOffset] = blockIndex; | |
+ rFoundBlocks[FileOffset] = blockIndex; | |
// No point in searching further, report success | |
return true; | |
} | |
- | |
// Next | |
scan = scan->mpNextInHashList; | |
} | |
@@ -909,12 +1244,9 @@ | |
instruction.mSpaceBefore = SizeOfInputFile; | |
rRecipe.push_back(instruction); | |
- #ifndef NDEBUG | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
- { | |
- TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); | |
- } | |
- #endif | |
+#ifndef NDEBUG | |
+ TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); | |
+#endif | |
// Don't do anything | |
return; | |
@@ -1006,22 +1338,19 @@ | |
// dump out the recipe | |
#ifndef NDEBUG | |
TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed); | |
- if(BackupStoreFile::TraceDetailsOfDiffProcess) | |
+ TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size()); | |
{ | |
- TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size()); | |
+ for(unsigned int e = 0; e < rRecipe.size(); ++e) | |
{ | |
- for(unsigned int e = 0; e < rRecipe.size(); ++e) | |
- { | |
- char b[64]; | |
+ char b[64]; | |
#ifdef WIN32 | |
- sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); | |
+ sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); | |
#else | |
- sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); | |
+ sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); | |
#endif | |
- TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); | |
- } | |
+ TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); | |
} | |
- TRACE0("======== ========= ========\n"); | |
} | |
+ TRACE0("======== ========= ========\n"); | |
#endif | |
} |