Created
April 25, 2009 11:22
-
-
Save SamSaffron/101591 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// LINQ-style helpers for thinning out sequences.
/// </summary>
static class LinqExtension {
    /// <summary>
    /// Returns every <paramref name="sample"/>-th element of <paramref name="items"/>,
    /// starting with the first one (positions 0, sample, 2 * sample, ...).
    /// </summary>
    /// <typeparam name="T">Element type of the sequence.</typeparam>
    /// <param name="items">Sequence to sample from. Enumerated lazily, once.</param>
    /// <param name="sample">Distance between picked elements; must be at least 1.</param>
    /// <returns>A lazily evaluated sequence containing the picked elements in their original order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="sample"/> is zero or negative.</exception>
    public static IEnumerable<T> SampleEvery<T>(this IEnumerable<T> items, int sample) {
        // Validate here rather than inside the iterator: an iterator body does not run
        // until the first MoveNext, which would delay the failure (previously a
        // DivideByZeroException for sample == 0) to some unrelated call site.
        if (items == null) {
            throw new ArgumentNullException("items");
        }
        if (sample <= 0) {
            throw new ArgumentOutOfRangeException("sample", sample, "The sampling interval must be a positive number.");
        }
        return SampleEveryIterator(items, sample);
    }

    // Lazy worker for SampleEvery; assumes the arguments were already validated.
    private static IEnumerable<T> SampleEveryIterator<T>(IEnumerable<T> items, int sample) {
        // Count down to the next pick instead of keeping an ever-growing index,
        // so arbitrarily long sequences cannot overflow the counter.
        int untilNextPick = 0;
        foreach (var item in items) {
            if (untilNextPick == 0) {
                yield return item;
                untilNextPick = sample;
            }
            untilNextPick--;
        }
    }
}
/// <summary>How many separate chunks GetFileSignature reads from a file that is sampled.</summary>
private const int SampleCount = 4;

/// <summary>Size in bytes (4 KiB) of each chunk read from a sampled file.</summary>
private const int SampleSize = 4 * 1024;

/// <summary>
/// Size cut-off in bytes (16 KiB): files shorter than this are hashed in full
/// by GetFileSignature instead of being sampled.
/// </summary>
private const int SamplingThreshold = 16 * 1024;
/// <summary>
/// Computes a cheap signature for a file without necessarily reading all of it.
/// </summary>
/// <remarks>
/// Files shorter than <c>SamplingThreshold</c> are hashed in full. Larger files are
/// represented by <c>SampleCount</c> chunks of <c>SampleSize</c> bytes taken at
/// pseudo-random offsets. The random generator is seeded from the file length, so
/// the same offsets are chosen every time for a file of a given size.
/// The file length is always mixed into the hash. Because only samples are hashed,
/// two large files of equal length that differ outside the sampled regions produce
/// the same signature.
/// Offsets are drawn as <c>int</c> values, so for files longer than
/// <c>Int32.MaxValue</c> bytes all samples start within the first
/// <c>Int32.MaxValue</c> bytes.
/// </remarks>
/// <param name="filename">Path of the file to fingerprint.</param>
/// <returns>The MD5 digest of the sampled bytes plus the file length, packed into a <see cref="Guid"/>.</returns>
public static Guid GetFileSignature(string filename) {
    byte[] buffer;
    long filesize;

    // FileShare.Read: taking a fingerprint should neither fail because another
    // process is reading the file nor lock other readers out while we sample it.
    using (var reader = File.Open(filename, FileMode.Open, FileAccess.Read, FileShare.Read)) {
        filesize = reader.Length;

        if (filesize < SamplingThreshold) {
            // Small file: hash the entire content.
            buffer = new byte[filesize];
            Read(reader, buffer, 0, (int)filesize);
        } else {
            // Seeding with the length makes the chosen offsets reproducible per file size.
            Random random = new Random((int)(filesize % int.MaxValue));
            int maxSize = filesize < (long)Int32.MaxValue ? (int)filesize : Int32.MaxValue;

            // Space out the offsets: draw four times as many as needed, sort them,
            // and keep every fourth one.
            var startPositions = Enumerable
                .Range(0, SampleCount * 4)
                .Select(_ => random.Next(maxSize))
                .OrderBy(i => i)
                .SampleEvery(4)
                .ToArray();

            // A sample that starts close to the end of the file yields fewer than
            // SampleSize bytes; the unused tail of the buffer stays zero and is
            // hashed as such.
            buffer = new byte[SampleCount * SampleSize];
            int bufferPosition = 0;
            foreach (var start in startPositions) {
                reader.Seek(start, SeekOrigin.Begin);
                bufferPosition += Read(reader, buffer, bufferPosition, SampleSize);
            }
        }
    }

    // The hash provider holds native resources, so release it deterministically.
    using (var md5Provider = new MD5CryptoServiceProvider()) {
        md5Provider.TransformBlock(buffer, 0, buffer.Length, buffer, 0);

        // include the filesize in the hash
        var fileSizeArray = BitConverter.GetBytes(filesize);
        md5Provider.TransformFinalBlock(fileSizeArray, 0, fileSizeArray.Length);

        return new Guid(md5Provider.Hash);
    }
}
/// <summary>
/// Reads from <paramref name="reader"/> until <paramref name="count"/> bytes have been
/// stored or the stream reports no more data, whichever comes first.
/// </summary>
/// <param name="reader">Stream to read from, starting at its current position.</param>
/// <param name="buffer">Destination array.</param>
/// <param name="offset">Index in <paramref name="buffer"/> where the first byte is stored.</param>
/// <param name="count">Maximum number of bytes to read.</param>
/// <returns>The number of bytes actually stored; less than <paramref name="count"/> only if the stream ended early.</returns>
private static int Read(FileStream reader, byte[] buffer, int offset, int count) {
    int filled = 0;
    while (true) {
        // A single Read call may return fewer bytes than requested, so keep asking
        // for the remainder until the request is satisfied or nothing comes back.
        int chunk = reader.Read(buffer, offset + filled, count - filled);
        filled += chunk;
        if (chunk <= 0 || filled >= count) {
            return filled;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment