-
-
Save tvandijck/e8ac50f01b6c656f5599d50b83e35ca9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MeowHash.cpp : This file contains the 'main' function. Program execution begins and ends there. | |
// | |
#include "pch.h" | |
#include <inttypes.h> | |
#include <intrin.h> | |
#include <stdio.h> | |
#include <string> | |
#include <map> | |
#include <vector> | |
#include <Windows.h> | |
#include <shlwapi.h> | |
#pragma comment(lib, "Shlwapi.lib") | |
#include "meow_hash.h" | |
/*union meow_lane | |
{ | |
struct | |
{ | |
__m128i L0; | |
__m128i L1; | |
__m128i L2; | |
__m128i L3; | |
}; | |
uint64_t Sub[8]; | |
uint32_t Sub32[16]; | |
}; | |
static inline void AESRotate(meow_lane &A, meow_lane &B) | |
{ | |
A.L0 = _mm_aesdec_si128(A.L0, B.L0); | |
A.L1 = _mm_aesdec_si128(A.L1, B.L1); | |
A.L2 = _mm_aesdec_si128(A.L2, B.L2); | |
A.L3 = _mm_aesdec_si128(A.L3, B.L3); | |
__m128i Temp = B.L0; | |
B.L0 = B.L1; | |
B.L1 = B.L2; | |
B.L2 = B.L3; | |
B.L3 = Temp; | |
} | |
static inline void AESLoad(meow_lane& S, const uint8_t* from) | |
{ | |
S.L0 = _mm_aesdec_si128(S.L0, *(__m128i *)(from)); | |
S.L1 = _mm_aesdec_si128(S.L1, *(__m128i *)(from + 16)); | |
S.L2 = _mm_aesdec_si128(S.L2, *(__m128i *)(from + 32)); | |
S.L3 = _mm_aesdec_si128(S.L3, *(__m128i *)(from + 48)); | |
} | |
static inline void AESMerge(meow_lane& A, const meow_lane& B) | |
{ | |
A.L0 = _mm_aesdec_si128(A.L0, B.L0); | |
A.L1 = _mm_aesdec_si128(A.L1, B.L1); | |
A.L2 = _mm_aesdec_si128(A.L2, B.L2); | |
A.L3 = _mm_aesdec_si128(A.L3, B.L3); | |
} | |
struct meow_state | |
{ | |
meow_lane S0123; | |
meow_lane S4567; | |
meow_lane S89AB; | |
meow_lane SCDEF; | |
}; | |
static meow_lane MeowHash(uint64_t seed, const void* sourceInit, uint64_t len, meow_state& state) | |
{ | |
// NOTE(casey): The initialization vector follows falkhash's lead and uses the seed twice, but the second time | |
// the length plus one is added to differentiate. This seemed sensible, but I haven't thought too hard about this, | |
// there may be better things to use as an IV. | |
meow_lane IV; | |
IV.Sub[0] = IV.Sub[2] = IV.Sub[4] = IV.Sub[6] = seed; | |
IV.Sub[1] = IV.Sub[3] = IV.Sub[5] = IV.Sub[7] = seed + len + 1; | |
// NOTE(casey): Initialize all 16 streams with the initialization vector | |
state.S0123 = IV; | |
state.S4567 = IV; | |
state.S89AB = IV; | |
state.SCDEF = IV; | |
// NOTE(casey): Handle as many full 256-byte blocks as possible | |
const uint8_t* source = static_cast<const uint8_t*>(sourceInit); | |
uint64_t blockCount = (len >> 8); | |
len -= (blockCount << 8); | |
while (blockCount--) | |
{ | |
AESLoad(state.S0123, source); | |
AESLoad(state.S4567, source + 64); | |
AESLoad(state.S89AB, source + 128); | |
AESLoad(state.SCDEF, source + 192); | |
source += (1 << 8); | |
} | |
// NOTE(casey): If residual data remains, hash one final 256-byte block padded with the initialization vector | |
if (len) | |
{ | |
// TODO(casey): Can this just be zeroes? | |
meow_lane partial[] = { IV, IV, IV, IV }; | |
uint8_t* dest = reinterpret_cast<uint8_t*>(partial); | |
while (len--) | |
{ | |
*dest++ = *source++; | |
} | |
uint8_t* P = reinterpret_cast<uint8_t*>(partial); | |
AESMerge(state.S0123, partial[0]); | |
AESMerge(state.S4567, partial[1]); | |
AESMerge(state.S89AB, partial[2]); | |
AESMerge(state.SCDEF, partial[3]); | |
} | |
// NOTE(casey): Combine the 16 streams into a single hash to spread the bits out evenly | |
meow_lane R0 = IV; | |
AESRotate(R0, state.S0123); | |
AESRotate(R0, state.S4567); | |
AESRotate(R0, state.S89AB); | |
AESRotate(R0, state.SCDEF); | |
AESRotate(R0, state.S0123); | |
AESRotate(R0, state.S4567); | |
AESRotate(R0, state.S89AB); | |
AESRotate(R0, state.SCDEF); | |
AESRotate(R0, state.S0123); | |
AESRotate(R0, state.S4567); | |
AESRotate(R0, state.S89AB); | |
AESRotate(R0, state.SCDEF); | |
AESRotate(R0, state.S0123); | |
AESRotate(R0, state.S4567); | |
AESRotate(R0, state.S89AB); | |
AESRotate(R0, state.SCDEF); | |
// NOTE(casey): Repeat AES enough times to ensure diffusion to all bits in each 128-bit lane | |
AESMerge(R0, IV); | |
AESMerge(R0, IV); | |
AESMerge(R0, IV); | |
AESMerge(R0, IV); | |
AESMerge(R0, IV); | |
return(R0); | |
}*/ | |
#define HASH_64 | |
std::string toString(const meow_lane& A) | |
{ | |
char buffer[256]; | |
#if defined(HASH_256) | |
sprintf_s(buffer, "%08X%08X%08X%08X-%08X%08X%08X%08X", | |
A.Sub32[0], A.Sub32[1], A.Sub32[2], A.Sub32[3], | |
A.Sub32[4], A.Sub32[5], A.Sub32[6], A.Sub32[7]); | |
#elif defined(HASH_128) | |
sprintf_s(buffer, "%08X%08X%08X%08X", | |
A.Sub32[0], A.Sub32[1], A.Sub32[2], A.Sub32[3]); | |
#elif defined(HASH_64) | |
sprintf_s(buffer, "%08X%08X", | |
A.Sub32[0], A.Sub32[1]); | |
#else | |
sprintf_s(buffer, "%08X%08X%08X%08X-%08X%08X%08X%08X-%08X%08X%08X%08X-%08X%08X%08X%08X", | |
A.Sub32[0], A.Sub32[1], A.Sub32[2], A.Sub32[3], | |
A.Sub32[4], A.Sub32[5], A.Sub32[6], A.Sub32[7], | |
A.Sub32[8], A.Sub32[9], A.Sub32[10], A.Sub32[11], | |
A.Sub32[12], A.Sub32[13], A.Sub32[14], A.Sub32[15]); | |
#endif | |
return buffer; | |
} | |
/*std::string toString(const meow_state& A) | |
{ | |
std::string str = "\n"; | |
str += " S0123: " + toString(A.S0123) + "\n"; | |
str += " S4567: " + toString(A.S4567) + "\n"; | |
str += " S89AB: " + toString(A.S89AB) + "\n"; | |
str += " SCDEF: " + toString(A.SCDEF) + "\n"; | |
return str; | |
}*/ | |
struct Info | |
{ | |
std::string fileName; | |
std::string state; | |
int fileSize; | |
}; | |
static std::map<std::string, Info> s_collisionMap; | |
static std::map<std::string, std::vector<Info>> s_collisions; | |
void Hash(std::string fileName) | |
{ | |
FILE* file = fopen(fileName.c_str(), "rb"); | |
if (file != nullptr) | |
{ | |
fseek(file, 0, SEEK_END); | |
int size = (int)ftell(file); | |
fseek(file, 0, SEEK_SET); | |
char* buffer = (char*)_aligned_malloc(size, 16); | |
fread(buffer, 1, size, file); | |
fclose(file); | |
//meow_state state; | |
//meow_lane hash = MeowHash(0, buffer, size, state); | |
meow_lane hash = MeowHash1(0, size, buffer); | |
_aligned_free(buffer); | |
std::string hashStr = toString(hash); | |
auto iter = s_collisionMap.find(hashStr); | |
if (iter != s_collisionMap.end()) | |
{ | |
if (s_collisions[hashStr].empty()) | |
{ | |
s_collisions[hashStr].push_back(iter->second); | |
} | |
Info info; | |
info.fileName = fileName; | |
info.fileSize = size; | |
//info.state = toString(state); | |
s_collisions[hashStr].push_back(info); | |
} else | |
{ | |
Info info; | |
info.fileName = fileName; | |
info.fileSize = size; | |
//info.state = toString(state); | |
s_collisionMap.insert(std::make_pair(hashStr, info)); | |
} | |
} | |
} | |
void EnumFiles(std::string folderPath, std::string searchParameter) | |
{ | |
std::string searchFile = folderPath + searchParameter; | |
WIN32_FIND_DATAA info; | |
HANDLE hFile = FindFirstFileA(searchFile.c_str(), &info); | |
if (hFile != INVALID_HANDLE_VALUE) | |
{ | |
do | |
{ | |
if (info.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) | |
{ | |
if (strcmp(info.cFileName, ".") != 0 && strcmp(info.cFileName, "..") != 0) | |
{ | |
EnumFiles(folderPath + info.cFileName + "\\", searchParameter); | |
} | |
} | |
else | |
{ | |
Hash(folderPath + info.cFileName); | |
} | |
} while (FindNextFileA(hFile, &info)); | |
FindClose(hFile); | |
} | |
} | |
int main() | |
{ | |
EnumFiles("C:\\extracted\\", "*.*"); | |
int collisionCount = 0; | |
for (auto& item : s_collisions) | |
{ | |
collisionCount++; | |
printf("COLLISION [%d]:\n", collisionCount); | |
printf(" hash : %s\n", item.first.c_str()); | |
printf(" collisions : %d\n", (int)item.second.size()); | |
for (auto& info : item.second) | |
{ | |
printf(" file : %s\n", info.fileName.c_str()); | |
printf(" size : %d\n", info.fileSize); | |
//printf(" state : %s\n", info.state.c_str()); | |
char output[260]; | |
char* ptr = PathFindFileNameA(info.fileName.c_str()); | |
sprintf(output, "c:\\coll\\%s", ptr); | |
CopyFileA(info.fileName.c_str(), output, false); | |
} | |
} | |
printf("\n"); | |
printf("Number of collisions: %d\n", collisionCount); | |
printf("Number of files : %d\n", (int)s_collisionMap.size()); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment