Skip to content

Instantly share code, notes, and snippets.

@moyix
Last active February 2, 2023 13:40
Show Gist options
  • Save moyix/f78e0b0d5724a1bf02e1a035e8bec136 to your computer and use it in GitHub Desktop.
Save moyix/f78e0b0d5724a1bf02e1a035e8bec136 to your computer and use it in GitHub Desktop.
/* License:
* Public domain.
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
// CUDA cruft
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Number of threads is N*M = 65535
// Threads per block
#define N 128
// Number of blocks
#define M 512
// This is 26^11 / 65536. Assumes that we will be running 65536
// simultaneous threads an we want to partition our search space between
// them evenly.
#define STRIDE 56005012314UL
#define MAX_RESULTS 1073741824UL
#define MAX_CHARS 11
__device__ static uint32_t obfuscateWord(unsigned char *e, int len) {
uint32_t t = 352960512;
for (int i = 0; i < len; i++) {
t = (t*33) ^ e[i];
}
return t;
}
__global__ void check(uint32_t goal, volatile int *index) {
int bindex = blockIdx.x * blockDim.x + threadIdx.x;
int64_t start = bindex*STRIDE;
int64_t num = 0;
// tmp for holding the digits of the start value
unsigned char start_buf[MAX_CHARS];
unsigned char candidate_buf[MAX_CHARS+1];
candidate_buf[MAX_CHARS] = '\0';
memset(candidate_buf, 'a', MAX_CHARS);
// Convert our start index to base 26. Don't know how many digits we
// will have so store it backward in start_buf and then reverse it
// into the end of candidate_buf
int digits = 0;
while (start > 0) {
int rem = start % 26;
unsigned char v = 'a' + rem;
start_buf[digits++] = v;
start = (start - rem) / 26;
}
for (int i = 0; i < digits; i++) {
candidate_buf[MAX_CHARS-i-1] = start_buf[i];
}
//printf("Thread %d working on chunk starting at %s\n",
// bindex, candidate_buf);
while (num < STRIDE+26UL) {
// Trick from Paul Khuong: once we compute hash of N-1 characters
// we can get the last one by just xoring in the last character.
uint32_t prefix = obfuscateWord(candidate_buf, MAX_CHARS-1)*33;
candidate_buf[MAX_CHARS-1] = '\0';
//printf("Prefix: %s Hash %08x\n", candidate_buf, prefix);
for (unsigned char c = 'a'; c <= 'z'; c++) {
if ((prefix^c) == goal) {
candidate_buf[MAX_CHARS-1] = c;
printf("%s\n", candidate_buf);
int old = atomicAdd((int *)index, 1);
//memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS);
}
}
num += 26;
// Now just incremement starting from the secont to last char
bool finished = true;
for (int i = MAX_CHARS-2; i >= 0; i--) {
// Increment
candidate_buf[i]++;
// Handle carry
if (candidate_buf[i] > 'z') {
candidate_buf[i] = 'a';
}
else {
finished = false;
break;
}
}
// If finished is still set then the carry propagated through
// the entire array, so we must be done.
if (finished) break;
}
}
int main(int argc, char **argv) {
//unsigned char *host_results;
//unsigned char *device_results;
if (argc < 2) {
fprintf(stderr, "usage: %s <hash>\n", argv[0]);
return 1;
}
uint64_t goal = strtoul(argv[1], NULL, 0);
if (errno != 0) {
perror("strtoul");
return 1;
}
if (goal > 0xffffffff) {
fprintf(stderr, "This doesn't look like a Copilot hash... too big!\n");
return 1;
}
uint32_t goal32 = (uint32_t)goal;
// This sets up a pinned chunk of memory that is visible on both
// the device and host. We will use it to report progress from
// the device which tells us when we need to collect results on
// the host.
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
#if 0
// Alloc mem for results device-side and host-side
cudaMallocHost((void **)&host_results, MAX_RESULTS*MAX_CHARS); // pinned
cudaMalloc((void **)&device_results, MAX_RESULTS*MAX_CHARS);
cudaCheckErrors("cudaMalloc device_results fail");
#endif
// We create a second CUDA stream here so that we can do memory
// copies from device to host in parallel with the kernel execution
cudaStream_t stream0;
cudaStream_t stream1;
cudaStreamCreate(&stream0);
cudaCheckErrors("cudaStreamCreate");
cudaStreamCreate(&stream1);
cudaCheckErrors("cudaStreamCreate");
// Kernel launch! We use an event so we can tell when the kernel
// has finished.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,stream0);
check<<<M, N, 0, stream0>>>(goal32, d_data);
cudaEventRecord(stop,stream0);
// Monitor for results
int value = 0;
while (true) {
sleep(5);
int value1 = *h_data;
if (value1 > value) {
fprintf(stderr, "solutions found = %d\n", value1);
#if 0
cudaMemcpyAsync(host_results+(value*MAX_CHARS),
device_results+(value*MAX_CHARS),
(value1-value)*MAX_CHARS,
cudaMemcpyDeviceToHost,
stream1);
cudaStreamSynchronize(stream1);
for (int i = value; i < value1; i++) {
printf("%.*s\n", MAX_CHARS, host_results+(i*MAX_CHARS));
}
#endif
value = value1;
}
if (cudaEventQuery(stop) == cudaSuccess) break;
}
float et;
cudaEventElapsedTime(&et, start, stop);
double keyspace = pow(26,MAX_CHARS);
fprintf(stderr, "Searched keyspace of size %lu in %f ms (%f H/s)\n",
(uint64_t)keyspace, et, (double)keyspace / (et / 1000));
return 0;
}
@Sc00bz
Copy link

Sc00bz commented Sep 2, 2021

Faster:

-         for (unsigned char c = 'a'; c <= 'z'; c++) {
-             if ((prefix^c) == goal) {
+         uint32_t c = prefix ^ goal;
+         if (c >= 'a' && c <= 'z') {
                  candidate_buf[MAX_CHARS-1] = c;
                  printf("%s\n", candidate_buf);
                  int old = atomicAdd((int *)index, 1);
                  //memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS);
              }
-         }

Also you can also do an early exit starting at MAX_CHARS-4 which should be much faster (or at least use less power):

__device__ static uint32_t earlyExit(uint32_t hash, uint32_t testHash, uint32_t pow33) {
    uint32_t hi = (hash | 0x7f) * pow33 + pow33 - 1;
    uint32_t lo = (hash & ~0x7f) * pow33;
    
    if ((lo < hi && (lo > testHash || hi < testHash)) ||
        (lo > hi && (lo > testHash && hi < testHash))) {
        return 1;
    }
    return 0;
}

...
while (num < STRIDE+26*26*26*26) {
    uint32_t hash0 = obfuscateWord(candidate_buf, MAX_CHARS-4);
    if (earlyExit(hash0, goal, 33*33*33*33) == 0) {
        uint32_t hash1 = 33 * hash0;
        for (unsigned char c1 = 'a'; c1 <= 'z'; c1++) {
            if (earlyExit(hash1 ^ c1, goal, 33*33*33) == 0) {
                uint32_t hash2 = 33 * (hash1 ^ c1);
                for (unsigned char c2 = 'a'; c2 <= 'z'; c2++) {
                    if (earlyExit(hash2 ^ c2, goal, 33*33) == 0) {
                        uint32_t hash3 = 33 * (hash2 ^ c2);
                        for (unsigned char c3 = 'a'; c3 <= 'z'; c3++) {
                            uint32_t c4 = (33 * (hash3 ^ c3)) ^ goal;
                            if (c4 >= 'a' && c4 <= 'z') {
                                candidate_buf[MAX_CHARS-4] = c1;
                                candidate_buf[MAX_CHARS-3] = c2;
                                candidate_buf[MAX_CHARS-2] = c3;
                                candidate_buf[MAX_CHARS-1] = c4;
                                printf("%s\n", candidate_buf);
                                int old = atomicAdd((int *)index, 1);
                                //memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS);
                            }
                        }
                    }
                }
            }
        }
    }
    num += 26*26*26*26;

    // Now just increment starting from the 5th to last char
    bool finished = true;
    for (int i = MAX_CHARS-5; i >= 0; i--) {
...

[edit: fixed earlyExit() missing a parenthesis and forgot to give it testHash]

@Sc00bz
Copy link

Sc00bz commented Sep 2, 2021

I got the meet in the middle attack working. It takes about a minute on a CPU to do 11 letters.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment