moyix/copilot_search.cu

## copilot_search.cu
/* License:
 *   Public domain.
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>

// CUDA cruft
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Number of threads is N*M = 65535
// Threads per block
#define N 128
// Number of blocks
#define M 512

// This is 26^11 / 65536. Assumes that we will be running 65536
// simultaneous threads an we want to partition our search space between
// them evenly.
#define STRIDE 56005012314UL

#define MAX_RESULTS 1073741824UL
#define MAX_CHARS 11

__device__ static uint32_t obfuscateWord(unsigned char *e, int len) {
    uint32_t t = 352960512;
    for (int i = 0; i < len; i++) {
        t = (t*33) ^ e[i];
    }
    return t;
}

__global__ void check(uint32_t goal, volatile int *index) {
    int bindex = blockIdx.x * blockDim.x + threadIdx.x;
    int64_t start = bindex*STRIDE;
    int64_t num = 0;

    // tmp for holding the digits of the start value
    unsigned char start_buf[MAX_CHARS];
    unsigned char candidate_buf[MAX_CHARS+1];
    candidate_buf[MAX_CHARS] = '\0';
    memset(candidate_buf, 'a', MAX_CHARS);

    // Convert our start index to base 26. Don't know how many digits we
    // will have so store it backward in start_buf and then reverse it
    // into the end of candidate_buf
    int digits = 0;
    while (start > 0) {
        int rem = start % 26;
        unsigned char v = 'a' + rem;
        start_buf[digits++] = v;
        start = (start - rem) / 26;
    }

    for (int i = 0; i < digits; i++) {
        candidate_buf[MAX_CHARS-i-1] = start_buf[i];
    }
    //printf("Thread %d working on chunk starting at %s\n",
    //        bindex, candidate_buf);

    while (num < STRIDE+26UL) {
        // Trick from Paul Khuong: once we compute hash of N-1 characters
        // we can get the last one by just xoring in the last character.
        uint32_t prefix = obfuscateWord(candidate_buf, MAX_CHARS-1)*33;
        candidate_buf[MAX_CHARS-1] = '\0';
        //printf("Prefix: %s Hash %08x\n", candidate_buf, prefix);
        for (unsigned char c = 'a'; c <= 'z'; c++) {
            if ((prefix^c) == goal) {
                candidate_buf[MAX_CHARS-1] = c;
                printf("%s\n", candidate_buf);
                int old = atomicAdd((int *)index, 1);
                //memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS);
            }
        }
        num += 26;
        // Now just incremement starting from the secont to last char
        bool finished = true;
        for (int i = MAX_CHARS-2; i >= 0; i--) {
            // Increment
            candidate_buf[i]++;
            // Handle carry
            if (candidate_buf[i] > 'z') {
                candidate_buf[i] = 'a';
            }
            else {
                finished = false;
                break;
            }
        }
        // If finished is still set then the carry propagated through
        // the entire array, so we must be done.
        if (finished) break;
    }
}

int main(int argc, char **argv) {
    //unsigned char *host_results;
    //unsigned char *device_results;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <hash>\n", argv[0]);
        return 1;
    }

    uint64_t goal = strtoul(argv[1], NULL, 0);
    if (errno != 0) {
        perror("strtoul");
        return 1;
    }
    if (goal > 0xffffffff) {
        fprintf(stderr, "This doesn't look like a Copilot hash... too big!\n");
        return 1;
    }
    uint32_t goal32 = (uint32_t)goal;

    // This sets up a pinned chunk of memory that is visible on both
    // the  device and host. We will use it to report progress from
    // the device which tells us when we need to collect results on
    // the host.
    volatile int *d_data, *h_data;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags error");
    cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc error");
    cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
    cudaCheckErrors("cudaHostGetDevicePointer error");
    *h_data = 0;

#if 0
    // Alloc mem for results device-side and host-side
    cudaMallocHost((void **)&host_results, MAX_RESULTS*MAX_CHARS); // pinned
    cudaMalloc((void **)&device_results, MAX_RESULTS*MAX_CHARS);
    cudaCheckErrors("cudaMalloc device_results fail");
#endif

    // We create a second CUDA stream here so that we can do memory
    // copies from device to host in parallel with the kernel execution
    cudaStream_t stream0;
    cudaStream_t stream1;
    cudaStreamCreate(&stream0);
    cudaCheckErrors("cudaStreamCreate");
    cudaStreamCreate(&stream1);
    cudaCheckErrors("cudaStreamCreate");

    // Kernel launch! We use an event so we can tell when the kernel
    // has finished.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,stream0);
    check<<<M, N, 0, stream0>>>(goal32, d_data);
    cudaEventRecord(stop,stream0);

    // Monitor for results
    int value = 0;
    while (true) {
        sleep(5);
        int value1 = *h_data;
        if (value1 > value) {
            fprintf(stderr, "solutions found = %d\n", value1);
#if 0
            cudaMemcpyAsync(host_results+(value*MAX_CHARS),
                            device_results+(value*MAX_CHARS),
                            (value1-value)*MAX_CHARS,
                            cudaMemcpyDeviceToHost,
                            stream1);
            cudaStreamSynchronize(stream1);
            for (int i = value; i < value1; i++) {
                printf("%.*s\n", MAX_CHARS, host_results+(i*MAX_CHARS));
            }
#endif
            value = value1;
        }
        if (cudaEventQuery(stop) == cudaSuccess) break;
    }
    float et;
    cudaEventElapsedTime(&et, start, stop);
    double keyspace = pow(26,MAX_CHARS);
    fprintf(stderr, "Searched keyspace of size %lu in %f ms (%f H/s)\n",
            (uint64_t)keyspace, et, (double)keyspace / (et / 1000));

    return 0;
}
	/* License:
	* Public domain.
	*/

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <errno.h>

	// CUDA cruft
	#define cudaCheckErrors(msg) \
	do { \
	cudaError_t __err = cudaGetLastError(); \
	if (__err != cudaSuccess) { \
	fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
	msg, cudaGetErrorString(__err), \
	__FILE__, __LINE__); \
	fprintf(stderr, "*** FAILED - ABORTING\n"); \
	exit(1); \
	} \
	} while (0)

	// Number of threads is N*M = 65535
	// Threads per block
	#define N 128
	// Number of blocks
	#define M 512

	// This is 26^11 / 65536. Assumes that we will be running 65536
	// simultaneous threads an we want to partition our search space between
	// them evenly.
	#define STRIDE 56005012314UL

	#define MAX_RESULTS 1073741824UL
	#define MAX_CHARS 11

	__device__ static uint32_t obfuscateWord(unsigned char *e, int len) {
	uint32_t t = 352960512;
	for (int i = 0; i < len; i++) {
	t = (t*33) ^ e[i];
	}
	return t;
	}

	__global__ void check(uint32_t goal, volatile int *index) {
	int bindex = blockIdx.x * blockDim.x + threadIdx.x;
	int64_t start = bindex*STRIDE;
	int64_t num = 0;

	// tmp for holding the digits of the start value
	unsigned char start_buf[MAX_CHARS];
	unsigned char candidate_buf[MAX_CHARS+1];
	candidate_buf[MAX_CHARS] = '\0';
	memset(candidate_buf, 'a', MAX_CHARS);

	// Convert our start index to base 26. Don't know how many digits we
	// will have so store it backward in start_buf and then reverse it
	// into the end of candidate_buf
	int digits = 0;
	while (start > 0) {
	int rem = start % 26;
	unsigned char v = 'a' + rem;
	start_buf[digits++] = v;
	start = (start - rem) / 26;
	}

	for (int i = 0; i < digits; i++) {
	candidate_buf[MAX_CHARS-i-1] = start_buf[i];
	}
	//printf("Thread %d working on chunk starting at %s\n",
	// bindex, candidate_buf);

	while (num < STRIDE+26UL) {
	// Trick from Paul Khuong: once we compute hash of N-1 characters
	// we can get the last one by just xoring in the last character.
	uint32_t prefix = obfuscateWord(candidate_buf, MAX_CHARS-1)*33;
	candidate_buf[MAX_CHARS-1] = '\0';
	//printf("Prefix: %s Hash %08x\n", candidate_buf, prefix);
	for (unsigned char c = 'a'; c <= 'z'; c++) {
	if ((prefix^c) == goal) {
	candidate_buf[MAX_CHARS-1] = c;
	printf("%s\n", candidate_buf);
	int old = atomicAdd((int *)index, 1);
	//memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS);
	}
	}
	num += 26;
	// Now just incremement starting from the secont to last char
	bool finished = true;
	for (int i = MAX_CHARS-2; i >= 0; i--) {
	// Increment
	candidate_buf[i]++;
	// Handle carry
	if (candidate_buf[i] > 'z') {
	candidate_buf[i] = 'a';
	}
	else {
	finished = false;
	break;
	}
	}
	// If finished is still set then the carry propagated through
	// the entire array, so we must be done.
	if (finished) break;
	}
	}

	int main(int argc, char **argv) {
	//unsigned char *host_results;
	//unsigned char *device_results;

	if (argc < 2) {
	fprintf(stderr, "usage: %s <hash>\n", argv[0]);
	return 1;
	}

	uint64_t goal = strtoul(argv[1], NULL, 0);
	if (errno != 0) {
	perror("strtoul");
	return 1;
	}
	if (goal > 0xffffffff) {
	fprintf(stderr, "This doesn't look like a Copilot hash... too big!\n");
	return 1;
	}
	uint32_t goal32 = (uint32_t)goal;

	// This sets up a pinned chunk of memory that is visible on both
	// the device and host. We will use it to report progress from
	// the device which tells us when we need to collect results on
	// the host.
	volatile int d_data, h_data;
	cudaSetDeviceFlags(cudaDeviceMapHost);
	cudaCheckErrors("cudaSetDeviceFlags error");
	cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
	cudaCheckErrors("cudaHostAlloc error");
	cudaHostGetDevicePointer((int *)&d_data, (int )h_data, 0);
	cudaCheckErrors("cudaHostGetDevicePointer error");
	*h_data = 0;

	#if 0
	// Alloc mem for results device-side and host-side
	cudaMallocHost((void *)&host_results, MAX_RESULTSMAX_CHARS); // pinned
	cudaMalloc((void *)&device_results, MAX_RESULTSMAX_CHARS);
	cudaCheckErrors("cudaMalloc device_results fail");
	#endif

	// We create a second CUDA stream here so that we can do memory
	// copies from device to host in parallel with the kernel execution
	cudaStream_t stream0;
	cudaStream_t stream1;
	cudaStreamCreate(&stream0);
	cudaCheckErrors("cudaStreamCreate");
	cudaStreamCreate(&stream1);
	cudaCheckErrors("cudaStreamCreate");

	// Kernel launch! We use an event so we can tell when the kernel
	// has finished.
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start,stream0);
	check<<<M, N, 0, stream0>>>(goal32, d_data);
	cudaEventRecord(stop,stream0);

	// Monitor for results
	int value = 0;
	while (true) {
	sleep(5);
	int value1 = *h_data;
	if (value1 > value) {
	fprintf(stderr, "solutions found = %d\n", value1);
	#if 0
	cudaMemcpyAsync(host_results+(value*MAX_CHARS),
	device_results+(value*MAX_CHARS),
	(value1-value)*MAX_CHARS,
	cudaMemcpyDeviceToHost,
	stream1);
	cudaStreamSynchronize(stream1);
	for (int i = value; i < value1; i++) {
	printf("%.s\n", MAX_CHARS, host_results+(iMAX_CHARS));
	}
	#endif
	value = value1;
	}
	if (cudaEventQuery(stop) == cudaSuccess) break;
	}
	float et;
	cudaEventElapsedTime(&et, start, stop);
	double keyspace = pow(26,MAX_CHARS);
	fprintf(stderr, "Searched keyspace of size %lu in %f ms (%f H/s)\n",
	(uint64_t)keyspace, et, (double)keyspace / (et / 1000));

	return 0;
	}