Skip to content

Instantly share code, notes, and snippets.

@boxbackup-bot
Created May 19, 2019 10:07
Show Gist options
  • Save boxbackup-bot/b19c2981a665efd6168024893f4cb5f3 to your computer and use it in GitHub Desktop.
Save boxbackup-bot/b19c2981a665efd6168024893f4cb5f3 to your computer and use it in GitHub Desktop.
Trac issue 45 attachments
Index: lib/backupclient/BackupStoreFile.h
===================================================================
--- lib/backupclient/BackupStoreFile.h (revision 2361)
+++ lib/backupclient/BackupStoreFile.h (working copy)
@@ -214,11 +214,6 @@
static void ResetStats();
static BackupStoreFileStats msStats;
- // For debug
-#ifndef NDEBUG
- static bool TraceDetailsOfDiffProcess;
-#endif
-
// For decoding encoded files
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile);
};
Index: test/backupdiff/testbackupdiff.cpp
===================================================================
--- test/backupdiff/testbackupdiff.cpp (revision 2361)
+++ test/backupdiff/testbackupdiff.cpp (working copy)
@@ -390,7 +390,7 @@
// Want to trace out all the details
#ifndef NDEBUG
#ifndef WIN32
- BackupStoreFile::TraceDetailsOfDiffProcess = true;
+ Logging::SetGlobalLevel(Log::TRACE);
#endif
#endif
Index: lib/backupclient/BackupStoreFileDiff.cpp
===================================================================
--- lib/backupclient/BackupStoreFileDiff.cpp (revision 2361)
+++ lib/backupclient/BackupStoreFileDiff.cpp (working copy)
@@ -38,22 +38,26 @@
using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef NDEBUG
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile,
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
+static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID,
+ BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut,
+ int Timeout, bool &rCanDiffFromThis);
+static void SearchForMatchingBlocks(IOStream &rFile,
+ std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks,
DiffTimer *pDiffTimer);
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
+static void SetupHashTable(BlocksAvailableEntry *pIndex,
+ int64_t NumBlocks, int32_t BlockSize,
+ BlocksAvailableEntry **pHashTable);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList,
+ RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize,
+ uint8_t *pHigh, int32_t HighSize,
+ int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex,
+ std::map<int64_t, int64_t> &rFoundBlocks);
+static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks,
+ std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// --------------------------------------------------------------------------
//
@@ -168,10 +172,6 @@
try
{
- // Find which sizes should be scanned
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-
// Flag for reporting to the user
bool completelyDifferent;
@@ -186,8 +186,7 @@
// Get size of file
sizeOfInputFile = file.BytesLeftToRead();
// Find all those lovely matching blocks
- SearchForMatchingBlocks(file, foundBlocks, pindex,
- blocksInIndex, sizesToScan, pDiffTimer);
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer);
// Is it completely different?
completelyDifferent = (foundBlocks.size() == 0);
@@ -363,393 +362,771 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Finds the most commonly used block sizes in the index
+// Name: static void SearchForMatchingBlocks(IOStream &,
+// std::map<int64_t, int64_t> &,
+// BlocksAvailableEntry *, int64_t, DiffTimer *)
+// Purpose: Find the matching blocks within the file.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+static void SearchForMatchingBlocks(IOStream &rFile,
+ std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer)
{
- // Array for lengths
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ Timer maximumDiffingTime(0, "MaximumDiffingTime");
- // Set arrays to lots of zeros (= unused entries)
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
+ if(pDiffTimer && pDiffTimer->IsManaged())
{
- Sizes[l] = 0;
- sizeCounts[l] = 0;
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
+ "MaximumDiffingTime");
}
- // Array for collecting sizes
- std::map<int32_t, int64_t> foundSizes;
+ // Flag to abort the run, if too many blocks are found or the diffing
+ // timeout expires
+ bool abortSearch = false;
- // Run through blocks and make a count of the entries
- for(int64_t b = 0; b < NumBlocks; ++b)
+ // Buffers used during both phases of search
+ uint8_t *pbuffer0 = 0;
+ uint8_t *pbuffer1 = 0;
+
+ // Track offsets that already have block matches. We don't really care
+ // if its sorted, and this actually produces a performance issue, so
+ // see pfitBitmap for a workaround
+ std::map<int64_t, int32_t> goodnessOfFit;
+
+ // Collect sizes that aren't found in the file at their old offset
+ std::map<int32_t, int64_t> unmatchedSizes;
+
+ // Our arrays of block sizes during second search pass
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int32_t maxScanBlockSize = 0;
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We need to keep separate rolling checksums for each block size in second search
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // And a hash lookup table per block size in second search
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We allow second search pass to short-circuit off rare blocks
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // During block read we have a bitmap of prefit locations to avoid std::map
+ // performance problems
+ uint8_t *pfitBitmap = 0;
+
+
+ // First search pass...
+ //
+ // For many files (especially large ones) most of the file is unchanged.
+ // The RollingChecksum process requires us to read every byte of the file looking for
+ // blocks that have moved. However, we can make that process more efficient by
+ // quickly rolling over areas that match a different block. We can also use
+ // this to eliminate entirely the rolling checksum for block sizes that only exist
+ // at one location in the file.
+ //
+ // Thus we start by looking for blocks that have not moved. Only if a block
+ // cannot be found at its previous location do we consider scanning for it by
+ // rolling checksum size.
+ //
+ // This strategy has some disadvantages for files with lots of repeating content
+ // that happens to align with our block size, but the reduction in diff time
+ // for more typical files is worth it.
+ //
+ // Note also that in this pass we consider _all_ block sizes (smaller than
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare
+ // is cheap for us to find in this pass.
+ //
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE);
+ try
{
- // Only if the block size is bigger than the minimum size we'll scan for
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
+ if(pbuffer0 == 0)
{
- // Find entry?
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
- if(f != foundSizes.end())
+ throw std::bad_alloc();
+ }
+
+ // We have to track file offset since the read may fail
+ int64_t fileOffset = 0;
+
+ // Walk the blocks
+ for(int64_t b = 0; b < NumBlocks; ++b)
+ {
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
{
- // Increment existing entry
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
}
- else
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Skip blocks too large for our buffer
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) {
+ fileOffset += pIndex[b].mSize;
+ continue;
+ }
+
+ // Have to guard the seek operation, it could throw. We don't know size of the
+ // current file, and checking is pointless anyway since there's a race.
+ // In reality on Unix this is implemented with lseek which will mean we can
+ // seek past the EOF, but I don't want to make assumptions about Win32.
+ int32_t readSize = 0;
+ try
{
- // New entry
- foundSizes[pIndex[b].mSize] = 1;
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute);
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize);
}
+ catch(BoxException &e)
+ {
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError)
+ {
+ // Not what we expected, rethrow
+ throw;
+ }
+ }
+
+ // Check for a match
+ bool blockMatched = false;
+ if(readSize == pIndex[b].mSize)
+ {
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you
+ // worry this is expensive just remember that prior versions of this code
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated
+ // rolling checksums every time.
+ MD5Digest strong;
+ strong.Add(pbuffer0, pIndex[b].mSize);
+ strong.Finish();
+
+ // Do we have a match?
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset);
+#endif
+ rFoundBlocks[fileOffset] = b;
+ goodnessOfFit[fileOffset] = pIndex[b].mSize;
+ blockMatched = true;
+ }
+ }
+
+ // We're done with the file offset, so increment now
+ fileOffset += pIndex[b].mSize;
+
+ // If the block didn't match then this is a size we'll have to scan for
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE))
+ {
+ // Find entry?
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize));
+ if(f != unmatchedSizes.end())
+ {
+ // Increment existing entry
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1;
+ }
+ else
+ {
+ // New entry
+ unmatchedSizes[pIndex[b].mSize] = 1;
+ }
+ }
}
+
+ // Cleanup
+ ::free(pbuffer0);
+ pbuffer0 = 0;
}
+ catch(...)
+ {
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
+ throw;
+ }
+
+ // Are we already out of time?
+ if(abortSearch)
+ {
+#ifndef NDEBUG
+ goto dumpDiffList;
+#endif
+ return;
+ }
- // Make the block sizes
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
+
+ // Second search pass...
+ //
+ // In our second phase, having matched all unchanged blocks we now need
+ // to scan for moved blocks. This involves looping across all unmatched
+ // block sizes and using the rolling checksum to look for relocations. To keep
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES
+ // for the number of sizes to scan. We also scan for the blocks in order of
+ // their relative probability, a block size that occurs frequently is scanned
+ // first.
+ //
+
+ // Loop all sizes inserting higher usages into the array
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i)
{
- // Find the position of the size in the array
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases
+ // it would be less expensive to upload a few thousand bytes rather than do the scan.
+ // Here would be a good place to filter block sizes that aren't worth the effort
+ // once a suitable set of heuristics is found.
+
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- // Instead of sorting on the raw count of blocks,
- // take the file area covered by this block size.
- if(i->second * i->first > sizeCounts[t] * Sizes[t])
+ // Instead of sorting on the raw count of blocks, take the file area covered by this
+ // block size. This helps avoid favoring low numbers of large blocks over many
+ // small blocks.
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t]))
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
{
- Sizes[s] = Sizes[s-1];
- sizeCounts[s] = sizeCounts[s-1];
+ scanSizes[s] = scanSizes[s-1];
+ scanSizesCount[s] = scanSizesCount[s-1];
}
// Insert this size
- Sizes[t] = i->first;
- sizeCounts[t] = i->second;
+ scanSizes[t] = i->first;
+ scanSizesCount[t] = i->second;
- // Shouldn't do any more searching
+ // Update max size
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first;
+
+ // Shouldn't do any more searching for this size
break;
}
}
}
-
- // trace the size table in debug builds
+
+ // Dump the size table in debug builds
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
+ if(scanSizes[t] != 0)
{
- BOX_TRACE("Diff block size " << t << ": " <<
- Sizes[t] << " (count = " <<
- sizeCounts[t] << ")");
+ BOX_TRACE("Scan block size " << t << ": " <<
+ scanSizes[t] << " count: " <<
+ scanSizesCount[t]);
}
}
#endif
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Find the matching blocks within the file.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
- BlocksAvailableEntry *pIndex, int64_t NumBlocks,
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
- Timer maximumDiffingTime(0, "MaximumDiffingTime");
-
- if(pDiffTimer && pDiffTimer->IsManaged())
+ // If we didn't find any sizes (could happen if all were found in the first matching
+ // phase) we're done.
+ if(maxScanBlockSize == 0)
{
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
- "MaximumDiffingTime");
+#ifndef NDEBUG
+ BOX_TRACE("No scan block sizes, skip rolling checksum");
+ goto dumpDiffList;
+#endif
+ return;
}
-
- std::map<int64_t, int32_t> goodnessOfFit;
- // Allocate the hash lookup table
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE);
- // Choose a size for the buffer, just a little bit more than the maximum block size
- int32_t bufSize = Sizes[0];
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
- {
- if(Sizes[z] > bufSize) bufSize = Sizes[z];
- }
- bufSize += 4;
- ASSERT(bufSize > Sizes[0]);
- ASSERT(bufSize > 0);
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
+ // Allocate two buffers we'll toggle between at the max scan block size
+ // Their sizes are doubled to make final block at the end easier (we're cheating)
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2);
- // TODO: Because we read in the file a scanned block size at a time,
- // it is likely to be inefficient. Probably will be much better to
- // calculate checksums for all block sizes in a single pass.
-
- // Allocate the buffers.
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
+ // Allocate a bitmap buffer to optimize goodnessOfFit access
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8);
+
try
{
// Check buffer allocation
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0)
{
// If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
- // Flag to abort the run, if too many blocks are found -- avoid using
- // huge amounts of processor time when files contain many similar blocks.
- bool abortSearch = false;
+ // Shift file position back to beginning
+ int64_t bufferFileOffset = 0;
+ rFile.Seek(0, IOStream::SeekType_Absolute);
- // Search for each block size in turn
- // NOTE: Do the smallest size first, so that the scheme for adding
- // entries in the found list works as expected and replaces smallers block
- // with larger blocks when it finds matches at the same offset in the file.
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
+ // We're going to be flipping back and forth between two buffers, the low and high
+ uint8_t *lowBuffer = pbuffer0;
+ int32_t lowBufferBytes = 0;
+ uint8_t *highBuffer = pbuffer1;
+ int32_t highBufferBytes = 0;
+
+ // In some cases we need to carry over reads from prior buffers
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // Read the first buffer's worth of data
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize);
+ // Fill the second buffer if appropriate
+ if(lowBufferBytes == maxScanBlockSize)
{
- ASSERT(Sizes[s] <= bufSize);
- BOX_TRACE("Diff pass " << s << ", for block size " <<
- Sizes[s]);
-
- // Check we haven't finished
- if(Sizes[s] == 0)
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+ }
+
+ // For every block size, initialize our scan tracking
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ ASSERT(scanSizes[z] <= maxScanBlockSize);
+
+ // The sizes array may be mostly empty, in those cases we have no
+ // state to maintain.
+ if(scanSizes[z] != 0)
{
- // empty entry, try next size
- continue;
+ // Mark for scan
+ scanThisSize[z] = true;
+
+ // Set up the hash table for this size
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ if(hashTables[z] == 0)
+ {
+ throw std::bad_alloc();
+ }
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]);
+
+ // Set up a rolling checksum, but only if there is enough data to start with
+ // (file may now be shorter than some block sizes previously used)
+ if(scanSizes[z] <= lowBufferBytes)
+ {
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]);
+ }
}
-
- // Set up the hash table entries
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
+ }
- // Shift file position to beginning
- rFile.Seek(0, IOStream::SeekType_Absolute);
-
- // Read first block
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
+ // Read loop while we can get full maxScanBlockSize reads
+ while(highBufferBytes == maxScanBlockSize)
+ {
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across
+ // the two buffers, which means we can walk every block size across all offsets
+ // in lowBuffer without any concern about overrunning the data available in highBuffer.
+
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
{
- // Size of file too short to match -- do next size
- continue;
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
}
- // Setup block pointers
- uint8_t *beginnings = pbuffer0;
- uint8_t *endings = pbuffer1;
- int offset = 0;
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
- // Calculate the first checksum, ready for rolling
- RollingChecksum rolling(beginnings, Sizes[s]);
+ // Don't you wish hash_map was standard? std::map is very slow
+ // when we access it as often as we do in the loop below. To work around
+ // we'll create a bitmap of the previous fits in this buffer
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break;
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
- // Then roll, until the file is exhausted
- int64_t fileBlockNumber = 0;
- int64_t fileOffset = 0;
- int rollOverInitialBytes = 0;
- while(true)
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
{
- if(maximumDiffingTime.HasExpired())
- {
- ASSERT(pDiffTimer != NULL);
- BOX_INFO("MaximumDiffingTime reached - "
- "suspending file diff");
- abortSearch = true;
- break;
- }
+ // If there is no rolling checksum at this index, skip: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
+
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset);
+#endif
- if(pDiffTimer)
- {
- pDiffTimer->DoKeepAlive();
- }
-
- // Load in another block of data, and record how big it is
- int bytesInEndings = rFile.Read(endings, Sizes[s]);
- int tmp;
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
- // Skip any bytes from a previous matched block
- if(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
{
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+ ASSERT(carryOverBytes[s] > 0);
+ // Carry can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the current max scan size
+ int32_t thisCarry = carryOverBytes[s];
+ if(thisCarry > maxScanBlockSize)
+ {
+ thisCarry = maxScanBlockSize;
+ }
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ // Perform carry
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize)
{
- goto refresh;
+ // Roll is split some low, some low/high
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s];
+ ASSERT(lowRollBytes >= 0);
+ if(lowRollBytes > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes);
+ }
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes);
}
+ else
+ {
+ // Roll is all in low buffer
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry);
+ }
+ // Either way offset is carry
+ bufferOffset = thisCarry;
+ // Reduce carry by the carry amount actually carried
+ carryOverBytes[s] -= thisCarry;
+ ASSERT(carryOverBytes[s] >= 0);
}
-
- if(goodnessOfFit.count(fileOffset))
+
+ // Loop remaining low buffer bytes, taking a checksum at each offset
+ while (bufferOffset < maxScanBlockSize)
{
- tmp = goodnessOfFit[fileOffset];
- }
- else
- {
- tmp = 0;
- }
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
- if(tmp >= Sizes[s])
- {
- // Skip over bigger ready-matched blocks completely
- rollOverInitialBytes = tmp;
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- goto refresh;
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
}
- }
- while(offset < bytesInEndings)
- {
- // Is current checksum in hash list?
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ if(bestFitSoFarSize >= scanSizes[s])
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ // Roll can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the max scan size
+ int32_t thisRoll = bestFitSoFarSize;
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
{
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- goodnessOfFit[fileOffset] = Sizes[s];
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
- // Block matched, roll the checksum forward to the next block without doing
- // any more comparisons, because these are pointless (as any more matches will be ignored when
- // the recipe is generated) and just take up valuable processor time. Edge cases are
- // especially nasty, using huge amounts of time and memory.
- int skip = Sizes[s];
- if(offset < bytesInEndings && skip > 0)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will
+ // push the rolling checksum so that it is against the end of the buffer.
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
- offset += thisRoll;
- fileOffset += thisRoll;
- skip -= thisRoll;
- }
- // Not all the bytes necessary will have been skipped, so get them
- // skipped after the next block is loaded.
- rollOverInitialBytes = skip;
-
- // End this loop, so the final byte isn't used again
- break;
- }
- else
+ if(thisRoll < 1)
{
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
}
- int64_t NumBlocksFound = static_cast<int64_t>(
- rFoundBlocks.size());
- int64_t MaxBlocksFound = NumBlocks *
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-
- if(NumBlocksFound > MaxBlocksFound)
+ // Roll high/low section. Our roll here will either exhaust the total or will
+ // leave the rolling checksum positioned at offset zero in the high buffer
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ int32_t lowSize = maxScanBlockSize - bufferOffset;
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s];
+ int32_t highSize = scanSizes[s] - lowSize;
+ if(SecondStageMatch(hashTables[s][hashValue],
+ rollingSums[s],
+ lowBuffer + bufferOffset, lowSize,
+ highBuffer, highSize,
+ scanSizes[s],
+ bufferFileOffset + bufferOffset,
+ pIndex, rFoundBlocks))
{
- abortSearch = true;
- break;
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize
+ // roll above for an explanation of the logic
+ int32_t thisRoll = scanSizes[s];
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
+ {
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
+ if(thisRoll < 1)
+ {
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
+ }
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
}
- // Roll checksum forward
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-
- // Increment offsets
- ++offset;
- ++fileOffset;
+ // Roll one byte, either low or low/high depending on current offset
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize)
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]);
+ }
+ else
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ }
+ ++bufferOffset;
}
+
+ // Check if we've found too much
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+ }
+ // If we're aborting, pass through
+ if(abortSearch) break;
+
+ // Swap buffers so high is low and the go back around
+ uint8_t *swap = lowBuffer;
+ lowBuffer = highBuffer;
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize));
+ bufferFileOffset += maxScanBlockSize;
+ highBuffer = swap;
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+
+ } // End of maxScanBlockSize reads
+
+ // What remains is either:
+ // - An abort or timeout
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer
+ // (Potentially zero low bytes for a zero-length file)
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer
+ //
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0))
+ {
+ // We're going to repeat the logic in the previous read loop, but have
+ // to be careful not to walk off the end of the available data. To make this
+ // a lot less convoluted, we're just going to copy to consolidate buffers.
+ // This is cheating a bit, but worth the simplicity.
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true);
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true);
+ uint8_t *endingBuffer = lowBuffer;
+ if(highBufferBytes != 0)
+ {
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes);
+ }
+ int32_t endingBytes = lowBufferBytes + highBufferBytes;
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // std::map still not winning
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it; it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
- if(abortSearch) break;
+ // If there's not enough data at the end, skip this block size
+ if(endingBytes < scanSizes[s]) continue;
+
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset);
+#endif
- refresh:
- // Finished?
- if(bytesInEndings != Sizes[s])
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
+
+ // For each block size we can only walk so far safely
+ int32_t maxSafeOffset = endingBytes - scanSizes[s];
+
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
{
- // No more data in file -- check the final block
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
- // each byte of the file)
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ ASSERT(carryOverBytes[s] > 0);
+ // Don't walk off the end
+ if(carryOverBytes[s] > maxSafeOffset) continue;
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]);
+ bufferOffset = carryOverBytes[s];
+ carryOverBytes[s] = 0; // Not really needed again, but be clean
+ }
+
+ // Loop remaining end bytes, we must walk to exactly and then break the loop
+ while (bufferOffset <= maxSafeOffset)
+ {
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Space for this roll?
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize);
+ bufferOffset += bestFitSoFarSize;
+ continue; // Back around to byte read loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ if(SecondStageMatch(hashTables[s][hashValue],
+ rollingSums[s],
+ endingBuffer + bufferOffset,
+ scanSizes[s],
+ 0, 0, scanSizes[s],
+ bufferFileOffset + bufferOffset,
+ pIndex, rFoundBlocks))
{
- goodnessOfFit[fileOffset] = Sizes[s];
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll over this matched block
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]);
+ bufferOffset += scanSizes[s];
+ continue; // Back around to byte read loop
}
}
-
- // finish
- break;
+
+ // Abort check
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+
+ // Because byte loop conditional can equal maxSafeOffset we must not read off
+ // end here
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ ++bufferOffset;
}
-
- // Switch buffers, reset offset
- beginnings = endings;
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
- offset = 0;
-
- // And count the blocks which have been done
- ++fileBlockNumber;
+ if(abortSearch) break;
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above
}
-
- if(abortSearch) break;
}
- // Free buffers and hash table
+ // Clean up
+ ::free(pbuffer0);
+ pbuffer0 = 0;
::free(pbuffer1);
pbuffer1 = 0;
- ::free(pbuffer0);
- pbuffer0 = 0;
- ::free(phashTable);
- phashTable = 0;
+ ::free(pfitBitmap);
+ pfitBitmap = 0;
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0)
+ {
+ free(hashTables[z]);
+ hashTables[z] = 0;
+ }
+ if(rollingSums[z] != 0)
+ {
+ delete rollingSums[z];
+ rollingSums[z] = 0;
+ }
+ }
}
catch(...)
{
- // Cleanup and throw
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
if(pbuffer1 != 0) ::free(pbuffer1);
- if(pbuffer0 != 0) ::free(pbuffer0);
- if(phashTable != 0) ::free(phashTable);
+ if(pfitBitmap != 0) ::free(pfitBitmap);
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0) free(hashTables[z]);
+ }
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(rollingSums[z] != 0) delete rollingSums[z];
+ }
throw;
}
-
+
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+dumpDiffList:
+ // Trace out the found blocks in debug mode
+ BOX_TRACE("Diff: list of found blocks");
+ BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(" Offset BlkIdx Size Movement");
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
{
- // Trace out the found blocks in debug mode
- BOX_TRACE("Diff: list of found blocks");
- BOX_TRACE("======== ======== ======== ========");
- BOX_TRACE(" Offset BlkIdx Size Movement");
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
+ int64_t orgLoc = 0;
+ for(int64_t b = 0; b < i->second; ++b)
{
- int64_t orgLoc = 0;
- for(int64_t b = 0; b < i->second; ++b)
- {
- orgLoc += pIndex[b].mSize;
- }
- BOX_TRACE(std::setw(8) << i->first << " " <<
- std::setw(8) << i->second << " " <<
- std::setw(8) << pIndex[i->second].mSize <<
- " " <<
- std::setw(8) << (i->first - orgLoc));
+ orgLoc += pIndex[b].mSize;
}
- BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(std::setw(8) << i->first << " " <<
+ std::setw(8) << i->second << " " <<
+ std::setw(8) << pIndex[i->second].mSize <<
+ " " <<
+ std::setw(8) << (i->first - orgLoc));
}
+ BOX_TRACE("======== ======== ======== ========");
#endif
}
@@ -781,6 +1158,8 @@
{
//BOX_TRACE("Another hash entry for " << hash << " found");
// Yes -- need to set the pointer in this entry to the current entry to build the linked list
+ // This is safe because there is only one hash table per block size and block sizes
+ // are unique.
pIndex[b].mpNextInHashList = pHashTable[hash];
}
@@ -790,30 +1169,35 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
// Name: static bool SecondStageMatch(xxx)
-// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
+// Purpose: When a match in the hash table is found, scan for
+// second stage match using strong checksum.
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList,
+ RollingChecksum *pFastSum, uint8_t *pLow, int32_t LowSize,
+ uint8_t *pHigh, int32_t HighSize, int32_t BlockSize,
+ int64_t FileOffset, BlocksAvailableEntry *pIndex,
+ std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
- ASSERT(pBeginnings != 0);
- ASSERT(pEndings != 0);
- ASSERT(Offset >= 0);
+ ASSERT(pFirstInHashList != 0);
+ ASSERT(pFastSum != 0);
+ ASSERT(pLow != 0);
+ ASSERT(LowSize != 0);
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true);
ASSERT(BlockSize > 0);
- ASSERT(pFirstInHashList != 0);
+ ASSERT(BlockSize == (LowSize + HighSize));
ASSERT(pIndex != 0);
#ifndef NDEBUG
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing();
#endif
- uint32_t Checksum = fastSum.GetChecksum();
+ uint32_t Checksum = pFastSum->GetChecksum();
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
BlocksAvailableEntry *scan = pFirstInHashList;
@@ -834,18 +1218,13 @@
// Calculate the strong MD5 digest for this block
MD5Digest strong;
- // Add the data from the beginnings
- strong.Add(pBeginnings + Offset, BlockSize - Offset);
- // Add any data from the endings
- if(Offset > 0)
- {
- strong.Add(pEndings, Offset);
- }
+ strong.Add(pLow, LowSize);
+ if(HighSize > 0) strong.Add(pHigh, HighSize);
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
scan = pFirstInHashList;
- //BOX_TRACE("second stage match");
+
while(scan != 0)
{
//BOX_TRACE("scan size " << scan->mSize <<
@@ -853,25 +1232,22 @@
// ", hash " << Hash);
ASSERT(scan->mSize == BlockSize);
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-
+
// Compare?
if(strong.DigestMatches(scan->mStrongChecksum))
{
- //BOX_TRACE("Match!\n");
// Found! Add to list of found blocks...
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
// We do NOT search for smallest blocks first, as this code originally assumed.
// To prevent this from potentially overwriting a better match, the caller must determine
// the relative "goodness" of any existing match and this one, and avoid the call if it
// could be detrimental.
- rFoundBlocks[fileOffset] = blockIndex;
+ rFoundBlocks[FileOffset] = blockIndex;
// No point in searching further, report success
return true;
}
-
// Next
scan = scan->mpNextInHashList;
}
@@ -917,11 +1293,8 @@
rRecipe.push_back(instruction);
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- BOX_TRACE("Diff: Default recipe generated, " <<
- SizeOfInputFile << " bytes of file");
- }
+ BOX_TRACE("Diff: Default recipe generated, " <<
+ SizeOfInputFile << " bytes of file");
#endif
// Don't do anything
@@ -1016,7 +1389,8 @@
BOX_TRACE("Diff: " <<
debug_NewBytesFound << " new bytes found, " <<
debug_OldBlocksUsed << " old blocks used");
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+
+ if (true)
{
BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size());
BOX_TRACE("======== ========= ========");
Index: test/bbackupd/testbbackupd.cpp
===================================================================
--- test/bbackupd/testbbackupd.cpp (revision 2361)
+++ test/bbackupd/testbbackupd.cpp (working copy)
@@ -1002,22 +1002,27 @@
// before any matching blocks could be found.
intercept_setup_delay("testfiles/TestDir1/spacetest/f1",
0, 4000, SYS_read, 1);
- pid = start_internal_daemon();
- intercept_clear_setup();
+ {
+ Timers::Init();
+ BackupDaemon bbackupd;
+ bbackupd.Configure("testfiles/bbackupd.conf");
+ bbackupd.InitCrypto();
- fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY);
- TEST_THAT(fd > 0);
- // write again, to update the file's timestamp
- TEST_EQUAL(sizeof(buffer), write(fd, buffer, sizeof(buffer)),
- "Buffer write");
- TEST_THAT(close(fd) == 0);
+ fd = open("testfiles/TestDir1/spacetest/f1", O_WRONLY);
+ TEST_THAT(fd > 0);
+ // write again, to update the file's timestamp
+ TEST_EQUAL(1, write(fd, "z", 1), "Buffer write");
+ TEST_THAT(close(fd) == 0);
- wait_for_backup_operation();
- // can't test whether intercept was triggered, because
- // it's in a different process.
- // TEST_THAT(intercept_triggered());
- TEST_THAT(stop_internal_daemon(pid));
+ // wait long enough to put file into sync window
+ wait_for_operation(5);
+ bbackupd.RunSyncNow();
+ TEST_THAT(intercept_triggered());
+ intercept_clear_setup();
+ Timers::Cleanup();
+ }
+
// check that the diff was aborted, i.e. upload was not a diff
found1 = false;
--- lib/backupclient/BackupStoreFile.h 2008-03-03 19:34:49.000000000 -0800
+++ lib/backupclient/BackupStoreFile.h 2008-02-27 20:17:20.000000000 -0800
@@ -206,11 +206,6 @@
static void ResetStats();
static BackupStoreFileStats msStats;
- // For debug
-#ifndef NDEBUG
- static bool TraceDetailsOfDiffProcess;
-#endif
-
// For decoding encoded files
static void DumpFile(void *clibFileHandle, bool ToTrace, IOStream &rFile);
};
--- test/backupdiff/testbackupdiff.cpp 2008-03-03 19:34:37.000000000 -0800
+++ test/backupdiff/testbackupdiff.cpp 2008-02-27 20:22:53.000000000 -0800
@@ -376,7 +376,7 @@
// Want to trace out all the details
#ifndef NDEBUG
#ifndef WIN32
- BackupStoreFile::TraceDetailsOfDiffProcess = true;
+ Logging::SetGlobalLevel(Log::TRACE);
#endif
#endif
--- lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 19:34:49.000000000 -0800
+++ lib/backupclient/BackupStoreFileDiff.cpp 2008-03-03 20:09:08.000000000 -0800
@@ -36,21 +36,12 @@
using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef NDEBUG
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile,
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
- DiffTimer *pDiffTimer);
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// --------------------------------------------------------------------------
@@ -166,10 +157,6 @@
try
{
- // Find which sizes should be scanned
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-
// Flag for reporting to the user
bool completelyDifferent;
@@ -184,8 +171,7 @@
// Get size of file
sizeOfInputFile = file.BytesLeftToRead();
// Find all those lovely matching blocks
- SearchForMatchingBlocks(file, foundBlocks, pindex,
- blocksInIndex, sizesToScan, pDiffTimer);
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, pDiffTimer);
// Is it completely different?
completelyDifferent = (foundBlocks.size() == 0);
@@ -361,390 +347,744 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Finds the most commonly used block sizes in the index
+// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, DiffTimer *)
+// Purpose: Find the matching blocks within the file.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, DiffTimer *pDiffTimer)
{
- // Array for lengths
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ Timer maximumDiffingTime(0);
- // Set arrays to lots of zeros (= unused entries)
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
+ if(pDiffTimer && pDiffTimer->IsManaged())
{
- Sizes[l] = 0;
- sizeCounts[l] = 0;
+ maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime());
}
- // Array for collecting sizes
- std::map<int32_t, int64_t> foundSizes;
+ // Flag to abort the run, if too many blocks are found or the diffing
+ // timeout expires
+ bool abortSearch = false;
+
+ // Buffers used during both phases of search
+ uint8_t *pbuffer0 = 0;
+ uint8_t *pbuffer1 = 0;
+
+ // Track offsets that already have block matches. We don't really care
+ // if its sorted, and this actually produces a performance issue, so
+ // see pfitBitmap for a workaround
+ std::map<int64_t, int32_t> goodnessOfFit;
- // Run through blocks and make a count of the entries
- for(int64_t b = 0; b < NumBlocks; ++b)
+ // Collect sizes that aren't found in the file at their old offset
+ std::map<int32_t, int64_t> unmatchedSizes;
+
+ // Our arrays of block sizes during second search pass
+ int32_t scanSizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int64_t scanSizesCount[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ int32_t maxScanBlockSize = 0;
+ ::memset(scanSizes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+ ::memset(scanSizesCount, 0, (sizeof(int64_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We need to keep separate rolling checksums for each block size in second search
+ RollingChecksum *rollingSums[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(rollingSums, 0, (sizeof(RollingChecksum *) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // And a hash lookup table per block size in second search
+ BlocksAvailableEntry **hashTables[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(hashTables, 0, (sizeof(BlocksAvailableEntry **) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // We allow second search pass to short-circuit off rare blocks
+ bool scanThisSize[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(scanThisSize, 0, (sizeof(bool) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // During block read we have a bitmap of prefit locations to avoid std::map
+ // performance problems
+ uint8_t *pfitBitmap = 0;
+
+
+ // First search pass...
+ //
+ // For many files (especially large ones) most of the file is unchanged.
+ // The RollingChecksum process requires us to read every byte of the file looking for
+ // blocks that have moved. However, we can make that process more efficient by
+ // quickly rolling over areas that match a different block. We can also use
+ // this to eliminate entirely the rolling checksum for block sizes that only exist
+ // at one location in the file.
+ //
+ // Thus we start by looking for blocks that have not moved. Only if a block
+ // cannot be found at its previous location do we consider scanning for it by
+ // rolling checksum size.
+ //
+ // This strategy has some disadvantages for files with lots of repeating content
+ // that happens to align with our block size, but the reduction in diff time
+ // for more typical files is worth it.
+ //
+ // Note also that in this pass we consider _all_ block sizes (smaller than
+ // BACKUP_FILE_MAX_BLOCK_SIZE). Any block size, no matter how small or rare
+ // is cheap for us to find in this pass.
+ //
+ pbuffer0 = (uint8_t *)::malloc(BACKUP_FILE_MAX_BLOCK_SIZE);
+ try
{
- // Only if the block size is bigger than the minimum size we'll scan for
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
+ if(pbuffer0 == 0)
+ {
+ throw std::bad_alloc();
+ }
+
+ // We have to track file offset since the read may fail
+ int64_t fileOffset = 0;
+
+ // Walk the blocks
+ for(int64_t b = 0; b < NumBlocks; ++b)
{
- // Find entry?
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
- if(f != foundSizes.end())
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
+ {
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
+ }
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Skip blocks too large for our buffer
+ if(pIndex[b].mSize > BACKUP_FILE_MAX_BLOCK_SIZE) {
+ fileOffset += pIndex[b].mSize;
+ continue;
+ }
+
+ // Have to guard the seek operation, it could throw. We don't know size of the
+ // current file, and checking is pointless anyway since there's a race.
+ // In reality on Unix this is implemented with lseek which will mean we can
+ // seek past the EOF, but I don't want to make assumptions about Win32.
+ int32_t readSize = 0;
+ try
+ {
+ rFile.Seek(fileOffset, IOStream::SeekType_Absolute);
+ readSize = rFile.Read(pbuffer0, pIndex[b].mSize);
+ }
+ catch(BoxException &e)
+ {
+ if(e.GetType() != CommonException::ExceptionType || e.GetSubType() != CommonException::OSFileError)
+ {
+ // Not what we expected, rethrow
+ throw;
+ }
+ }
+
+ // Check for a match
+ bool blockMatched = false;
+ if(readSize == pIndex[b].mSize)
{
- // Increment existing entry
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
+ // We don't have a rolling checksum to this point, so all we can do is MD5. If you
+ // worry this is expensive just remember that prior versions of this code
+ // re-read the file BACKUP_FILE_DIFF_MAX_BLOCK_SIZES times and calculated
+ // rolling checksums every time.
+ MD5Digest strong;
+ strong.Add(pbuffer0, pIndex[b].mSize);
+ strong.Finish();
+
+ // Do we have a match?
+ if(strong.DigestMatches(pIndex[b].mStrongChecksum))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found unchanged block of size " << pIndex[b].mSize << " at offset " << fileOffset);
+#endif
+ rFoundBlocks[fileOffset] = b;
+ goodnessOfFit[fileOffset] = pIndex[b].mSize;
+ blockMatched = true;
+ }
}
- else
+
+ // We're done with the file offset, so increment now
+ fileOffset += pIndex[b].mSize;
+
+ // If the block didn't match then this is a size we'll have to scan for
+ if(!blockMatched && (pIndex[b].mSize >= BACKUP_FILE_DIFF_MIN_BLOCK_SIZE))
{
- // New entry
- foundSizes[pIndex[b].mSize] = 1;
+ // Find entry?
+ std::map<int32_t, int64_t>::const_iterator f(unmatchedSizes.find(pIndex[b].mSize));
+ if(f != unmatchedSizes.end())
+ {
+ // Increment existing entry
+ unmatchedSizes[pIndex[b].mSize] = unmatchedSizes[pIndex[b].mSize] + 1;
+ }
+ else
+ {
+ // New entry
+ unmatchedSizes[pIndex[b].mSize] = 1;
+ }
}
}
+
+ // Cleanup
+ ::free(pbuffer0);
+ pbuffer0 = 0;
}
-
- // Make the block sizes
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
+ catch(...)
+ {
+ // Cleanup and rethrow
+ if(pbuffer0 != 0) ::free(pbuffer0);
+ throw;
+ }
+
+ // Are we already out of time?
+ if(abortSearch)
{
- // Find the position of the size in the array
+#ifndef NDEBUG
+ goto dumpDiffList;
+#endif
+ return;
+ }
+
+
+ // Second search pass...
+ //
+ // In our second phase, having matched all unchanged blocks we now need
+ // to scan for moved blocks. This involves looping across all unmatched
+ // block sizes and using the rolling checksum to look for relocations. To keep
+ // this from being too expensive we cap at BACKUP_FILE_DIFF_MAX_BLOCK_SIZES
+ // for the number of sizes to scan. We also scan for the blocks in order of
+ // their relative probability, a block size that occurs frequently is scanned
+ // first.
+ //
+
+ // Loop all sizes inserting higher usages into the array
+ for(std::map<int32_t, int64_t>::const_iterator i(unmatchedSizes.begin()); i != unmatchedSizes.end(); ++i)
+ {
+ // TODO: Scanning for any block size isn't cheap, and realistically in many cases
+ // it would be less expensive to upload a few thousand bytes rather than do the scan.
+ // Here would be a good place to filter block sizes that aren't worth the effort
+ // once a suitable set of heuristics is found.
+
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- // Instead of sorting on the raw count of blocks,
- // take the file area covered by this block size.
- if(i->second * i->first > sizeCounts[t] * Sizes[t])
+ // Instead of sorting on the raw count of blocks, take the file area covered by this
+ // block size. This helps avoid favoring low numbers of large blocks over many
+ // small blocks.
+ if((i->second * i->first) > (scanSizesCount[t] * scanSizes[t]))
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
{
- Sizes[s] = Sizes[s-1];
- sizeCounts[s] = sizeCounts[s-1];
+ scanSizes[s] = scanSizes[s-1];
+ scanSizesCount[s] = scanSizesCount[s-1];
}
// Insert this size
- Sizes[t] = i->first;
- sizeCounts[t] = i->second;
+ scanSizes[t] = i->first;
+ scanSizesCount[t] = i->second;
- // Shouldn't do any more searching
+ // Update max size
+ if(maxScanBlockSize < i->first) maxScanBlockSize = i->first;
+
+ // Shouldn't do any more searching for this size
break;
}
}
}
-
- // trace the size table in debug builds
+ // Dump the size table in debug builds
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
- {
- TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]);
- }
+ if(scanSizes[t] != 0) BOX_TRACE("Scan block size " << t << ": " << scanSizes[t] << " count: " << scanSizesCount[t]);
}
#endif
-}
-
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Find the matching blocks within the file.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
- BlocksAvailableEntry *pIndex, int64_t NumBlocks,
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
- Timer maximumDiffingTime(0);
- if(pDiffTimer && pDiffTimer->IsManaged())
+ // If we didn't find any sizes (could happen if all were found in the first matching
+ // phase) we're done.
+ if(maxScanBlockSize == 0)
{
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime());
+#ifndef NDEBUG
+ BOX_TRACE("No scan block sizes, skip rolling checksum");
+ goto dumpDiffList;
+#endif
+ return;
}
+ ASSERT(maxScanBlockSize <= BACKUP_FILE_MAX_BLOCK_SIZE);
- std::map<int64_t, int32_t> goodnessOfFit;
-
- // Allocate the hash lookup table
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
-
- // Choose a size for the buffer, just a little bit more than the maximum block size
- int32_t bufSize = Sizes[0];
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
- {
- if(Sizes[z] > bufSize) bufSize = Sizes[z];
- }
- bufSize += 4;
- ASSERT(bufSize > Sizes[0]);
- ASSERT(bufSize > 0);
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
+ // Allocate two buffers we'll toggle between at the max scan block size
+ // Their sizes are doubled to make final block at the end easier (we're cheating)
+ pbuffer0 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+ pbuffer1 = (uint8_t *)::malloc(maxScanBlockSize * 2);
+
+ // Allocate a bitmap buffer to optimize goodnessOfFit access
+ pfitBitmap = (uint8_t *)::malloc(maxScanBlockSize * 2 / 8);
- // TODO: Because we read in the file a scanned block size at a time,
- // it is likely to be inefficient. Probably will be much better to
- // calculate checksums for all block sizes in a single pass.
-
- // Allocate the buffers.
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
try
{
// Check buffer allocation
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
+ if(pbuffer0 == 0 || pbuffer1 == 0 || pfitBitmap == 0)
{
// If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
- // Flag to abort the run, if too many blocks are found -- avoid using
- // huge amounts of processor time when files contain many similar blocks.
- bool abortSearch = false;
-
- // Search for each block size in turn
- // NOTE: Do the smallest size first, so that the scheme for adding
- // entries in the found list works as expected and replaces smallers block
- // with larger blocks when it finds matches at the same offset in the file.
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
- {
- ASSERT(Sizes[s] <= bufSize);
- BOX_TRACE("Diff pass " << s << ", for block size " <<
- Sizes[s]);
-
- // Check we haven't finished
- if(Sizes[s] == 0)
- {
- // empty entry, try next size
- continue;
- }
-
- // Set up the hash table entries
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
+ // Shift file position back to beginning
+ int64_t bufferFileOffset = 0;
+ rFile.Seek(0, IOStream::SeekType_Absolute);
- // Shift file position to beginning
- rFile.Seek(0, IOStream::SeekType_Absolute);
-
- // Read first block
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
+ // We're going to be flipping back and forth between two buffers, the low and high
+ uint8_t *lowBuffer = pbuffer0;
+ int32_t lowBufferBytes = 0;
+ uint8_t *highBuffer = pbuffer1;
+ int32_t highBufferBytes = 0;
+
+ // In some cases we need to carry over reads from prior buffers
+ int32_t carryOverBytes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ ::memset(carryOverBytes, 0, (sizeof(int32_t) * BACKUP_FILE_DIFF_MAX_BLOCK_SIZES));
+
+ // Read the first buffer's worth of data
+ lowBufferBytes = rFile.Read(lowBuffer, maxScanBlockSize);
+ // Fill the second buffer if appropriate
+ if(lowBufferBytes == maxScanBlockSize)
+ {
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
+ }
+
+ // For every block size, initialize our scan tracking
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ ASSERT(scanSizes[z] <= maxScanBlockSize);
+
+ // The sizes array may be mostly empty, in those cases we have no
+ // state to maintain.
+ if(scanSizes[z] != 0)
{
- // Size of file too short to match -- do next size
- continue;
- }
-
- // Setup block pointers
- uint8_t *beginnings = pbuffer0;
- uint8_t *endings = pbuffer1;
- int offset = 0;
-
- // Calculate the first checksum, ready for rolling
- RollingChecksum rolling(beginnings, Sizes[s]);
-
- // Then roll, until the file is exhausted
- int64_t fileBlockNumber = 0;
- int64_t fileOffset = 0;
- int rollOverInitialBytes = 0;
- while(true)
- {
- if(maximumDiffingTime.HasExpired())
- {
- ASSERT(pDiffTimer != NULL);
- BOX_INFO("MaximumDiffingTime reached - "
- "suspending file diff");
- abortSearch = true;
- break;
- }
+ // Mark for scan
+ scanThisSize[z] = true;
- if(pDiffTimer)
+ // Set up the hash table for this size
+ hashTables[z] = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+ if(hashTables[z] == 0)
{
- pDiffTimer->DoKeepAlive();
+ throw std::bad_alloc();
}
-
- // Load in another block of data, and record how big it is
- int bytesInEndings = rFile.Read(endings, Sizes[s]);
- int tmp;
+ SetupHashTable(pIndex, NumBlocks, scanSizes[z], hashTables[z]);
- // Skip any bytes from a previous matched block
- if(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ // Set up a rolling checksum, but only if there is enough data to start with
+ // (file may now be shorter than some block sizes previously used)
+ if(scanSizes[z] <= lowBufferBytes)
{
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+ rollingSums[z] = new RollingChecksum(lowBuffer, scanSizes[z]);
+ }
+ }
+ }
+
+ // Read loop while we can get full maxScanBlockSize reads
+ while(highBufferBytes == maxScanBlockSize)
+ {
+ // Oh happy day! We have maxScanBlockSize * 2 bytes available to us across
+ // the two buffers, which means we can walk every block size across all offsets
+ // in lowBuffer without any concern about overrunning the data available in highBuffer.
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+ // Check diffing timeout
+ if(maximumDiffingTime.HasExpired())
+ {
+ ASSERT(pDiffTimer != NULL);
+ BOX_INFO("MaximumDiffingTime reached - suspending file diff");
+ abortSearch = true;
+ break;
+ }
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // Don't you wish hash_map was standard? std::map is very slow
+ // when we access it as often as we do in the loop below. To work around
+ // this, we'll create a bitmap of the previous fits in this buffer
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ if(i->first >= (bufferFileOffset + maxScanBlockSize)) break;
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
+ }
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file offset " << bufferFileOffset);
+#endif
+
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
- if(rollOverInitialBytes)
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
+ {
+ ASSERT(carryOverBytes[s] > 0);
+ // Carry can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the current max scan size
+ int32_t thisCarry = carryOverBytes[s];
+ if(thisCarry > maxScanBlockSize)
{
- goto refresh;
+ thisCarry = maxScanBlockSize;
}
- }
-
- if(goodnessOfFit.count(fileOffset))
- {
- tmp = goodnessOfFit[fileOffset];
- }
- else
- {
- tmp = 0;
- }
- if(tmp >= Sizes[s])
- {
- // Skip over bigger ready-matched blocks completely
- rollOverInitialBytes = tmp;
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
+ // Perform carry
+ if((thisCarry + scanSizes[s]) > maxScanBlockSize)
+ {
+ // Roll is split some low, some low/high
+ int32_t lowRollBytes = maxScanBlockSize - scanSizes[s];
+ ASSERT(lowRollBytes >= 0);
+ if(lowRollBytes > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], lowRollBytes);
+ }
+ rollingSums[s]->RollForwardSeveral(lowBuffer + lowRollBytes, highBuffer, scanSizes[s], thisCarry - lowRollBytes);
+ }
+ else
{
- goto refresh;
+ // Roll is all in low buffer
+ rollingSums[s]->RollForwardSeveral(lowBuffer, lowBuffer + scanSizes[s], scanSizes[s], thisCarry);
}
+ // Either way offset is carry
+ bufferOffset = thisCarry;
+ // Reduce carry by the carry amount actually carried
+ carryOverBytes[s] -= thisCarry;
+ ASSERT(carryOverBytes[s] >= 0);
}
-
- while(offset < bytesInEndings)
+
+ // Loop remaining low buffer bytes, taking a checksum at each offset
+ while (bufferOffset < maxScanBlockSize)
{
- // Is current checksum in hash list?
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Roll can be bigger than maxScanBlockSize because first matching
+ // phase might have used bigger blocks than the max scan size
+ int32_t thisRoll = bestFitSoFarSize;
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
{
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- goodnessOfFit[fileOffset] = Sizes[s];
-
- // Block matched, roll the checksum forward to the next block without doing
- // any more comparisons, because these are pointless (as any more matches will be ignored when
- // the receipe is generated) and just take up valuable processor time. Edge cases are
- // especially nasty, using huge amounts of time and memory.
- int skip = Sizes[s];
- if(offset < bytesInEndings && skip > 0)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- skip -= thisRoll;
- }
- // Not all the bytes necessary will have been skipped, so get them
- // skipped after the next block is loaded.
- rollOverInitialBytes = skip;
-
- // End this loop, so the final byte isn't used again
- break;
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
}
- else
+
+ // Roll forward in the lower buffer. This will either exhaust the total roll or will
+ // push the rolling checksum so that it is against the end of the buffer.
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
{
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
}
-
- int64_t NumBlocksFound = static_cast<int64_t>(
- rFoundBlocks.size());
- int64_t MaxBlocksFound = NumBlocks *
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-
- if(NumBlocksFound > MaxBlocksFound)
+ if(thisRoll < 1)
{
- abortSearch = true;
- break;
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
}
+ // Roll high/low section. Our roll here will either exhaust the total or will
+ // leave the rolling checksum positioned at offset zero in the high buffer
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
- // Roll checksum forward
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-
- // Increment offsets
- ++offset;
- ++fileOffset;
- }
-
- if(abortSearch) break;
-
- refresh:
- // Finished?
- if(bytesInEndings != Sizes[s])
- {
- // No more data in file -- check the final block
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
- // each byte of the file)
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
{
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ int32_t lowSize = maxScanBlockSize - bufferOffset;
+ if(lowSize > scanSizes[s]) lowSize = scanSizes[s];
+ int32_t highSize = scanSizes[s] - lowSize;
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], lowBuffer + bufferOffset, lowSize,
+ highBuffer, highSize, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks))
{
- goodnessOfFit[fileOffset] = Sizes[s];
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll so we don't bother diffing the remainder of this block, see bestFitSoFarSize
+ // roll above for an explanation of the logic
+ int32_t thisRoll = scanSizes[s];
+ int32_t remainderRoll = 0;
+ if(thisRoll > maxScanBlockSize)
+ {
+ thisRoll = maxScanBlockSize;
+ remainderRoll = bestFitSoFarSize - thisRoll;
+ }
+ int32_t lowRoll = ((maxScanBlockSize - bufferOffset - scanSizes[s]) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset - scanSizes[s]);
+ if(lowRoll > 0)
+ {
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, lowBuffer + bufferOffset + scanSizes[s], scanSizes[s], lowRoll);
+ bufferOffset += lowRoll;
+ thisRoll -= lowRoll;
+ }
+ if(thisRoll < 1)
+ {
+ ASSERT(remainderRoll == 0);
+ continue; // Back around to bufferOffset loop
+ }
+ int32_t lowHighRoll = ((maxScanBlockSize - bufferOffset) > thisRoll) ? thisRoll : (maxScanBlockSize - bufferOffset);
+ ASSERT(lowHighRoll > 0);
+ rollingSums[s]->RollForwardSeveral(lowBuffer + bufferOffset, highBuffer + (scanSizes[s] - (maxScanBlockSize - bufferOffset)), scanSizes[s], lowHighRoll);
+ bufferOffset += lowHighRoll;
+ carryOverBytes[s] = (thisRoll - lowHighRoll) + remainderRoll;
+ ASSERT(carryOverBytes[s] >= 0);
+ continue; // Back around to bufferOffset loop
}
}
+
+ // Roll one byte, either low or low/high depending on current offset
+ if((bufferOffset + scanSizes[s]) >= maxScanBlockSize)
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], highBuffer[scanSizes[s] - (maxScanBlockSize - bufferOffset)], scanSizes[s]);
+ }
+ else
+ {
+ rollingSums[s]->RollForward(lowBuffer[bufferOffset], lowBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ }
+ ++bufferOffset;
+ }
- // finish
+ // Check if we've found too much
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
break;
}
-
- // Switch buffers, reset offset
- beginnings = endings;
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
- offset = 0;
+ }
+ // If we're aborting, pass through
+ if(abortSearch) break;
+
+ // Swap buffers so high is low and then go back around
+ uint8_t *swap = lowBuffer;
+ lowBuffer = highBuffer;
+ ASSERT((lowBufferBytes == highBufferBytes) && (lowBufferBytes == maxScanBlockSize));
+ bufferFileOffset += maxScanBlockSize;
+ highBuffer = swap;
+ highBufferBytes = rFile.Read(highBuffer, maxScanBlockSize);
- // And count the blocks which have been done
- ++fileBlockNumber;
+ } // End of maxScanBlockSize reads
+
+ // What remains is either:
+ // - An abort or timeout
+ // - A low buffer with less than maxScanBlockSize bytes and an empty high buffer
+ // (Potentially zero low bytes for a zero-length file)
+ // - A maxScanBlockSize low buffer and a maxScanBlockSize or less high buffer
+ //
+ if(!abortSearch && !maximumDiffingTime.HasExpired() && (lowBufferBytes > 0))
+ {
+ // We're going to repeat the logic in the previous read loop, but have
+ // to be careful not to walk off the end of the available data. To make this
+ // a lot less convoluted, we're just going to copy to consolidate buffers.
+ // This is cheating a bit, but worth the simplicity.
+ ASSERT((lowBufferBytes < maxScanBlockSize) ? (highBufferBytes == 0) : true);
+ ASSERT((highBufferBytes != 0) ? (lowBufferBytes == maxScanBlockSize) : true);
+ uint8_t *endingBuffer = lowBuffer;
+ if(highBufferBytes != 0)
+ {
+ ::memcpy(endingBuffer + maxScanBlockSize, highBuffer, highBufferBytes);
+ }
+ int32_t endingBytes = lowBufferBytes + highBufferBytes;
+
+ // Send keep alive
+ if(pDiffTimer) pDiffTimer->DoKeepAlive();
+
+ // std::map still not winning
+ ::memset(pfitBitmap, 0, maxScanBlockSize * 2 / 8);
+ for(std::map<int64_t, int32_t>::const_iterator i(goodnessOfFit.lower_bound(bufferFileOffset)); i != goodnessOfFit.end(); ++i)
+ {
+ pfitBitmap[(i->first - bufferFileOffset) >> 3] |= (1 << ((i->first - bufferFileOffset) & 0x7));
}
+
+ // Walk all block sizes in block-probability order
+ for(int s = 0; s < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++s)
+ {
+ // If there is no rolling checksum at this index, skip it: it's either
+ // a zero size or the file was too small
+ if(!scanThisSize[s] || (rollingSums[s] == 0)) continue;
+
+ // If there's not enough data at the end, skip this block size
+ if(endingBytes < scanSizes[s]) continue;
- if(abortSearch) break;
+#ifndef NDEBUG
+ BOX_TRACE("Diff block size " << scanSizes[s] << " at file ending offset " << bufferFileOffset);
+#endif
+
+ // Offset of this block buffer
+ int32_t bufferOffset = 0;
+
+ // For each block size we can only walk so far safely
+ int32_t maxSafeOffset = endingBytes - scanSizes[s];
+
+ // Roll carry over
+ if(carryOverBytes[s] != 0)
+ {
+ ASSERT(carryOverBytes[s] > 0);
+ // Don't walk off the end
+ if(carryOverBytes[s] > maxSafeOffset) continue;
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer, endingBuffer + scanSizes[s], scanSizes[s], carryOverBytes[s]);
+ bufferOffset = carryOverBytes[s];
+ carryOverBytes[s] = 0; // Not really needed again, but be clean
+ }
+
+ // Loop over the remaining end bytes; we must walk exactly to maxSafeOffset and then break the loop
+ while (bufferOffset <= maxSafeOffset)
+ {
+ // Look for larger size matches at this offset
+ int32_t bestFitSoFarSize = 0;
+ if(pfitBitmap[bufferOffset >> 3] & (1 << (bufferOffset & 0x7)))
+ {
+ bestFitSoFarSize = goodnessOfFit[bufferFileOffset + bufferOffset];
+ }
+ if(bestFitSoFarSize >= scanSizes[s]) {
+ // Space for this roll?
+ if((bufferOffset + bestFitSoFarSize) > maxSafeOffset) break; // Kill byte walk loop
+ // Roll
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], bestFitSoFarSize);
+ bufferOffset += bestFitSoFarSize;
+ continue; // Back around to byte read loop
+ }
+
+ // Look for block match at this size
+ uint16_t hashValue = rollingSums[s]->GetComponentForHashing();
+ if(hashTables[s][hashValue] != 0)
+ {
+ if(SecondStageMatch(hashTables[s][hashValue], rollingSums[s], endingBuffer + bufferOffset, scanSizes[s],
+ 0, 0, scanSizes[s], bufferFileOffset + bufferOffset, pIndex, rFoundBlocks))
+ {
+#ifndef NDEBUG
+ BOX_TRACE("Found block match for hash " << hashValue << " of size " << scanSizes[s] << " at offset " << bufferFileOffset + bufferOffset);
+#endif
+ // Update best fit so far
+ ASSERT((goodnessOfFit.count(bufferFileOffset + bufferOffset) == 0) || (goodnessOfFit[bufferFileOffset + bufferOffset] < scanSizes[s]));
+ goodnessOfFit[bufferFileOffset + bufferOffset] = scanSizes[s];
+ pfitBitmap[bufferOffset >> 3] |= (1 << (bufferOffset & 0x7));
+
+ // We've found a match, don't scan anymore if we don't expect more blocks of this size
+ --scanSizesCount[s];
+ if(scanSizesCount[s] < 1)
+ {
+ scanThisSize[s] = false;
+#ifndef NDEBUG
+ BOX_TRACE("Short-circuit size " << scanSizes[s]);
+#endif
+ }
+
+ // Roll over this matched block
+ if((bufferOffset + scanSizes[s]) > maxSafeOffset) break; // Kill byte walk loop
+ rollingSums[s]->RollForwardSeveral(endingBuffer + bufferOffset, endingBuffer + bufferOffset + scanSizes[s], scanSizes[s], scanSizes[s]);
+ bufferOffset += scanSizes[s];
+ continue; // Back around to byte read loop
+ }
+ }
+
+ // Abort check
+ int64_t NumBlocksFound = static_cast<int64_t>(rFoundBlocks.size());
+ int64_t MaxBlocksFound = NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
+ if(NumBlocksFound > MaxBlocksFound)
+ {
+ abortSearch = true;
+ break;
+ }
+
+ // Because the byte loop conditional can equal maxSafeOffset, we must not read off
+ // the end here
+ if(bufferOffset == maxSafeOffset) break; // Done with reading bytes
+ rollingSums[s]->RollForward(endingBuffer[bufferOffset], endingBuffer[bufferOffset + scanSizes[s]], scanSizes[s]);
+ ++bufferOffset;
+ }
+ if(abortSearch) break;
+ // NOTE: Nothing else can go here without restructuring use of "break" in the byte read loop above
+ }
}
- // Free buffers and hash table
- ::free(pbuffer1);
- pbuffer1 = 0;
+ // Clean up
::free(pbuffer0);
pbuffer0 = 0;
- ::free(phashTable);
- phashTable = 0;
+ ::free(pbuffer1);
+ pbuffer1 = 0;
+ ::free(pfitBitmap);
+ pfitBitmap = 0;
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0)
+ {
+ free(hashTables[z]);
+ hashTables[z] = 0;
+ }
+ if(rollingSums[z] != 0)
+ {
+ delete rollingSums[z];
+ rollingSums[z] = 0;
+ }
+ }
}
catch(...)
{
- // Cleanup and throw
- if(pbuffer1 != 0) ::free(pbuffer1);
+ // Cleanup and rethrow
if(pbuffer0 != 0) ::free(pbuffer0);
- if(phashTable != 0) ::free(phashTable);
+ if(pbuffer1 != 0) ::free(pbuffer1);
+ if(pfitBitmap != 0) ::free(pfitBitmap);
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(hashTables[z] != 0) free(hashTables[z]);
+ }
+ for(int z = 0; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(rollingSums[z] != 0) delete rollingSums[z];
+ }
throw;
}
-
+
#ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- // Trace out the found blocks in debug mode
- BOX_TRACE("Diff: list of found blocks");
- BOX_TRACE("======== ======== ======== ========");
- BOX_TRACE(" Offset BlkIdx Size Movement");
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
- {
- int64_t orgLoc = 0;
- for(int64_t b = 0; b < i->second; ++b)
- {
- orgLoc += pIndex[b].mSize;
- }
- BOX_TRACE(std::setw(8) << i->first << " " <<
- std::setw(8) << i->second << " " <<
- std::setw(8) << pIndex[i->second].mSize <<
- " " <<
- std::setw(8) << (i->first - orgLoc));
- }
- BOX_TRACE("======== ======== ======== ========");
+dumpDiffList:
+ // Trace out the found blocks in debug mode
+ BOX_TRACE("Diff: list of found blocks");
+ BOX_TRACE("======== ======== ======== ========");
+ BOX_TRACE(" Offset BlkIdx Size Movement");
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
+ {
+ int64_t orgLoc = 0;
+ for(int64_t b = 0; b < i->second; ++b)
+ {
+ orgLoc += pIndex[b].mSize;
+ }
+ BOX_TRACE(std::setw(8) << i->first << " " <<
+ std::setw(8) << i->second << " " <<
+ std::setw(8) << pIndex[i->second].mSize <<
+ " " <<
+ std::setw(8) << (i->first - orgLoc));
}
+ BOX_TRACE("======== ======== ======== ========");
#endif
}
@@ -776,6 +1116,8 @@
{
//TRACE1("Another hash entry for %d found\n", hash);
// Yes -- need to set the pointer in this entry to the current entry to build the linked list
+ // This is safe because there is only one hash table per block size and block sizes
+ // are unique.
pIndex[b].mpNextInHashList = pHashTable[hash];
}
@@ -785,7 +1127,6 @@
}
}
-
// --------------------------------------------------------------------------
//
// Function
@@ -794,21 +1135,24 @@
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum *pFastSum,
+ uint8_t *pLow, int32_t LowSize, uint8_t *pHigh, int32_t HighSize, int32_t BlockSize, int64_t FileOffset,
+ BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
- ASSERT(pBeginnings != 0);
- ASSERT(pEndings != 0);
- ASSERT(Offset >= 0);
- ASSERT(BlockSize > 0);
ASSERT(pFirstInHashList != 0);
+ ASSERT(pFastSum != 0);
+ ASSERT(pLow != 0);
+ ASSERT(LowSize != 0);
+ ASSERT((HighSize > 0) ? (pHigh != 0) : true);
+ ASSERT(BlockSize > 0);
+ ASSERT(BlockSize == (LowSize + HighSize));
ASSERT(pIndex != 0);
#ifndef NDEBUG
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
+ uint16_t DEBUG_Hash = pFastSum->GetComponentForHashing();
#endif
- uint32_t Checksum = fastSum.GetChecksum();
+ uint32_t Checksum = pFastSum->GetChecksum();
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
BlocksAvailableEntry *scan = pFirstInHashList;
@@ -829,42 +1173,33 @@
// Calculate the strong MD5 digest for this block
MD5Digest strong;
- // Add the data from the beginnings
- strong.Add(pBeginnings + Offset, BlockSize - Offset);
- // Add any data from the endings
- if(Offset > 0)
- {
- strong.Add(pEndings, Offset);
- }
+ strong.Add(pLow, LowSize);
+ if(HighSize > 0) strong.Add(pHigh, HighSize);
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
scan = pFirstInHashList;
- //TRACE0("second stage match\n");
while(scan != 0)
{
//TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash);
ASSERT(scan->mSize == BlockSize);
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-
+
// Compare?
if(strong.DigestMatches(scan->mStrongChecksum))
{
- //TRACE0("Match!\n");
// Found! Add to list of found blocks...
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
// We do NOT search for smallest blocks first, as this code originally assumed.
// To prevent this from potentially overwriting a better match, the caller must determine
// the relative "goodness" of any existing match and this one, and avoid the call if it
// could be detrimental.
- rFoundBlocks[fileOffset] = blockIndex;
+ rFoundBlocks[FileOffset] = blockIndex;
// No point in searching further, report success
return true;
}
-
// Next
scan = scan->mpNextInHashList;
}
@@ -909,12 +1244,9 @@
instruction.mSpaceBefore = SizeOfInputFile;
rRecipe.push_back(instruction);
- #ifndef NDEBUG
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
- }
- #endif
+#ifndef NDEBUG
+ TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
+#endif
// Don't do anything
return;
@@ -1006,22 +1338,19 @@
// dump out the recipe
#ifndef NDEBUG
TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed);
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
{
- TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
+ for(unsigned int e = 0; e < rRecipe.size(); ++e)
{
- for(unsigned int e = 0; e < rRecipe.size(); ++e)
- {
- char b[64];
+ char b[64];
#ifdef WIN32
- sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
+ sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#else
- sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
+ sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#endif
- TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
- }
+ TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
}
- TRACE0("======== ========= ========\n");
}
+ TRACE0("======== ========= ========\n");
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment