Skip to content

Instantly share code, notes, and snippets.

@fador
Created November 3, 2016 16:43
Show Gist options
  • Save fador/3ab7d4f2c1ef4d262a49b7af0e088d6b to your computer and use it in GitHub Desktop.
Save fador/3ab7d4f2c1ef4d262a49b7af0e088d6b to your computer and use it in GitHub Desktop.
arc4 hasher in OpenCL for tietoturvahaaste.fi challenge
/*
Copyright (c) 2016, Marko Viitanen (Fador)
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <random>
#include <string>
#include <Windows.h>
#include <CL/opencl.h>
// Points at the OpenCL program source text. main() sets it to the contents
// loaded from "arc4_search.cl" and passes its address to
// clCreateProgramWithSource.
char *arc4_brute;
/**
 * Returns a timestamp in microseconds taken from a monotonic clock.
 *
 * The value counts from an unspecified epoch, so it is only meaningful as a
 * difference between two calls (which is how main() uses it to time a batch).
 * std::chrono::steady_clock is used on every platform because a wall-clock
 * source can jump backwards or forwards when the system time is adjusted,
 * which would corrupt elapsed-time measurements.
 *
 * @return microseconds elapsed since the steady clock's epoch.
 */
unsigned long long microTime()
{
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::steady_clock;

  const auto since_epoch = steady_clock::now().time_since_epoch();
  return static_cast<unsigned long long>(duration_cast<microseconds>(since_epoch).count());
}
int main(int argc, char **argv)
{
FILE *src = NULL;
cl_int error;
cl_uint num;
cl_uint ok = 1;
cl_context context;
unsigned long long GPUstart;
unsigned long long GPUend;
unsigned long long CPUstart;
unsigned long long CPUend;
unsigned char *pBuf = NULL;
if (argc < 3) {
printf("Usage:\r\n");
printf("arc4 <file> <keylen> (<platform ID> <start index>)\r\n");
return 0;
}
src = fopen(argv[1], "rb");
if (src == NULL) {
printf("Cannot open input file\r\n");
return -1;
}
fseek(src, 0, SEEK_END);
size_t size = ftell(src);
rewind(src);
pBuf = (unsigned char *)calloc(size, 1);
if(fread(pBuf, 1, size, src) != size)
{
printf("Failed reading the file!\r\n");
return EXIT_FAILURE;
}
fclose(src);
cl_short keylen = atoi(argv[2]);
int selected_platform = -1;
if (argc >= 4) {
selected_platform = atoi(argv[3]);
}
uint64_t offset = 0;
if (argc >= 5) {
offset = atoll(argv[4]);
}
cl_platform_id platform = NULL;
// cl_context context;
cl_command_queue queue;
cl_device_id device;
// Platform
error = clGetPlatformIDs(0, NULL, &num);
if (error != CL_SUCCESS) {
std::cout << "Error getting platform id: " << error << std::endl;
exit(error);
}
std::cout << "Number of platforms: " << num << std::endl;
cl_platform_id* platforms = new cl_platform_id[num];
error = clGetPlatformIDs(num, platforms, NULL);
if (error != CL_SUCCESS) {
std::cout << "Error getting device ids: " << error << std::endl;
exit(error);
}
char platformName[100];
// Use given Platform ID or search for an AMD GPU
if (selected_platform != -1) {
if (selected_platform >= num) {
printf("Invalid platform ID\r\n");
return EXIT_FAILURE;
}
clGetPlatformInfo(platforms[selected_platform],
CL_PLATFORM_VENDOR,
sizeof(platformName),
platformName,
NULL);
platform = platforms[selected_platform];
printf("Selected platform: %s\r\n", platformName);
} else {
for (unsigned i = 0; i < num; ++i) {
error = clGetPlatformInfo(platforms[i],
CL_PLATFORM_VENDOR,
sizeof(platformName),
platformName,
NULL);
if (error != CL_SUCCESS) {
std::cout << "clGetPlatformInfo failed." << std::endl;
}
platform = platforms[i];
if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) {
break;
}
}
}
if (platform == NULL) {
std::cout << "NULL platform found so Exiting Application." << std::endl;
exit(error);
}
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platform,
0
};
// Context
context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
if (error != CL_SUCCESS) {
std::cout << "Error creating context: " << error << std::endl;
exit(error);
}
size_t deviceListSize = 0;
int status = 0;
status = clGetContextInfo(
context,
CL_CONTEXT_DEVICES,
0,
NULL,
&deviceListSize);
if (error != CL_SUCCESS) {
std::cout << "clGetContextInfo failed." << error << std::endl;
exit(error);
}
int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
cl_device_id *devices;
devices = (cl_device_id *)malloc(deviceListSize);
/* Now, get the device list data */
status = clGetContextInfo(context,
CL_CONTEXT_DEVICES,
deviceListSize,
devices,
NULL);
cl_command_queue_properties prop = 0;
// Command-queue
queue = clCreateCommandQueue(context, devices[0], prop, &error);
if (error != CL_SUCCESS) {
std::cout << "Error creating command queue: " << error << std::endl;
exit(error);
}
std::ifstream in("arc4_search.cl");
std::string contents((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
arc4_brute = (char *)contents.c_str();
size_t progsize = strlen(arc4_brute);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&arc4_brute, &progsize, &error);
if (error != CL_SUCCESS) {
std::cout << "Error clCreateProgramWithSource: " << error << std::endl;
exit(error);
}
// Builds the program
error = clBuildProgram(program, 1, &devices[0], NULL, NULL, NULL);
if (error != CL_SUCCESS) {
std::cout << "Error clBuildProgram: " << error << std::endl;
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = new char[ret_val_size + 1];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
std::cout << build_log << std::endl;
exit(error);
}
size_t image_support; size_t image_support_size;
error = clGetDeviceInfo(devices[0], CL_DEVICE_IMAGE_SUPPORT, sizeof(size_t), &image_support, &image_support_size);
if (error != CL_SUCCESS) {
std::cout << "Error clGetDeviceInfo: " << error << std::endl;
exit(error);
}
size_t image_support_height; size_t image_support_height_size;
error = clGetDeviceInfo(devices[0], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &image_support_height, &image_support_height_size);
if (error != CL_SUCCESS) {
std::cout << "Error clGetDeviceInfo: " << error << std::endl;
exit(error);
}
size_t image_support_width; size_t image_support_width_size;
error = clGetDeviceInfo(devices[0], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &image_support_width, &image_support_width_size);
if (error != CL_SUCCESS) {
std::cout << "Error clGetDeviceInfo: " << error << std::endl;
exit(error);
}
std::cout << "image2d support: " << image_support << " (" << image_support_width << "x" << image_support_height << ")" << std::endl;
// Extracting the kernel
cl_kernel arc4_search_k = clCreateKernel(program, "arc4_search", &error);
if (error != CL_SUCCESS) {
std::cout << "Error clCreateKernel: " << error << std::endl;
exit(error);
}
//Fetch information about compute device
size_t pref_workg_size_mult;
size_t max_workg_size;
size_t max_workit_sizes[3];
clGetKernelWorkGroupInfo(arc4_search_k, devices[0],
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(size_t), (void*)&pref_workg_size_mult, NULL);
clGetDeviceInfo(devices[0],
CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), (void*)&max_workg_size, NULL);
clGetDeviceInfo(devices[0],
CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * 3, (void*)max_workit_sizes, NULL);
std::cout << "CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: " << pref_workg_size_mult << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE: " << max_workg_size << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_ITEM_SIZES: " << max_workit_sizes[0] << " " << max_workit_sizes[1] << " " << max_workit_sizes[2] << std::endl;
char* best_match_string = new char[64];
cl_mem output_data_str = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 64*sizeof(cl_char), NULL, &error);
#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
cl_mem input_string = clCreateBuffer(context, CL_MEM_READ_ONLY, 64, NULL, &error);
cl_mem input_keyposition = clCreateBuffer(context, CL_MEM_READ_ONLY, 256*sizeof(cl_int), NULL, &error);
cl_mem page_locked_buffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
char *page_locked_ptr = (char *)clEnqueueMapBuffer( queue, page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
// Calculate how many actual key positions are used because of the broken algorithm in arc4
unsigned char keyPositions[256][256] = { 0 };
int keyLens[256] = { 0 };
for (int keylen_t = 1; keylen_t < 256; keylen_t++) {
for (int i = 0; i < 256; i++) keyPositions[keylen_t][i & keylen_t] = 1;
for (int i = 0; i < keylen_t; i++) {
if (keyPositions[keylen_t][i]) keyLens[keylen_t]++;
}
}
int key_changable_positions[256] = { 0 };
int cur_key_pos = 0;
for (int i = 0; i < keylen; i++) {
if (keyPositions[keylen][i]) key_changable_positions[cur_key_pos++] = i;
}
// Copy the crypted input to the page locked memory
memcpy(page_locked_ptr, pBuf, 41);
// Move page locked memory to the GPU input_string
error = clEnqueueWriteBuffer(queue, input_string, CL_TRUE, 0, 41, page_locked_ptr, 0, NULL, NULL );
memcpy(page_locked_ptr, key_changable_positions, 256*sizeof(int));
error = clEnqueueWriteBuffer(queue, input_keyposition, CL_TRUE, 0, 256*sizeof(cl_int), page_locked_ptr, 0, NULL, NULL);
if (error != CL_SUCCESS) {
std::cout << "Error clEnqueueWriteImage: " << error << std::endl;
exit(error);
}
// Number of work-items per work-group, use recommended
size_t local_ws[] = { max_workg_size };
// Split work on multiple sets because GPU doesn't like to have too many work units at the same time
size_t global_ws[] = { 256 * 256 * 25 }; // Total number of work-items
printf("changable_len: %d\r\n", keyLens[keylen]);
uint64_t total_work = pow(36, keyLens[keylen]);
printf("total_work: %d\r\n", total_work);
bool more_work = true;
total_work -= offset;
int work_units = total_work / global_ws[0];
// Enqueuing parameters
error = clSetKernelArg(arc4_search_k, 0, sizeof(keylen), &keylen);
error = clSetKernelArg(arc4_search_k, 1, sizeof(input_string), &input_string);
error |= clSetKernelArg(arc4_search_k, 2, sizeof(output_data_str), &output_data_str);
error |= clSetKernelArg(arc4_search_k, 3, sizeof(input_keyposition), &input_keyposition);
error |= clSetKernelArg(arc4_search_k, 4, sizeof(cl_int), &keyLens[keylen]);
if (error != CL_SUCCESS) {
std::cout << "Error clSetKernelArg: " << error << std::endl;
exit(error);
}
std::cout << "Worksize: " << global_ws[0] << std::endl;
// Do work on limited sets
while (more_work) {
GPUstart = microTime();
// Update parameter for offset with each round
error |= clSetKernelArg(arc4_search_k, 5, sizeof(cl_ulong), &offset);
error = clEnqueueNDRangeKernel(queue, arc4_search_k, 1, NULL, global_ws, NULL, 0, NULL, NULL);
if (error != CL_SUCCESS) {
std::cout << "Error clEnqueueNDRangeKernel: " << error << std::endl;
exit(error);
}
// Reading back
error = clEnqueueReadBuffer(queue, output_data_str, CL_TRUE, 0, 41*sizeof(cl_char), best_match_string, 0, NULL, NULL);
if (error != CL_SUCCESS) {
std::cout << "Error clEnqueueReadBuffer: " << error << std::endl;
exit(error);
}
CPUstart = microTime();
best_match_string[41] = 0;
std::cout << best_match_string << std::endl;
offset+= global_ws[0];
if(!work_units) more_work = false;
std::cout << "Finished task " << offset << "/" << total_work << " (task " << work_units << ")" << std::endl;
work_units--;
if (!strncmp("http://", best_match_string, 7)) more_work = false;
GPUend = microTime();
std::cout << (uint64_t)(global_ws[0]/(((double)GPUend - (double)GPUstart)/1000000.0)) << " hash/s" << std::endl;
}
// Cleaning up
clReleaseKernel(arc4_search_k);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return EXIT_SUCCESS;
}
/* Cipher state: the 256-entry substitution box shuffled by init_arc4(). */
typedef struct _arc4state
{
  unsigned char sbox[256];
} arc4state;

/*
 * Key schedule: fills state->sbox with 0..255 and then shuffles it using key.
 *
 * state  - cipher state to initialise (overwritten).
 * key    - key bytes; the byte read on step n is key[n & keylen].
 * keylen - value AND-ed with the step counter to pick the key byte.
 *
 * NOTE: the key index is a bitwise AND with keylen, not a modulo. The host
 * code in this file derives the set of relevant key bytes from the same
 * expression, so the two have to stay in step.
 */
void init_arc4(arc4state *state, char *key, int keylen)
{
  int pos;
  int mix = 0;

  /* Start from the identity permutation. */
  for (pos = 0; pos < 256; pos++)
  {
    state->sbox[pos] = pos;
  }

  /* Swap every entry with one chosen by the running, key-dependent index. */
  for (pos = 0; pos < 256; pos++)
  {
    const unsigned char current = state->sbox[pos];
    mix = (current + mix + key[pos & keylen]) & 0xFF;
    state->sbox[pos] = state->sbox[mix];
    state->sbox[mix] = current;
  }
}
/*
 * Decrypts up to buflen bytes of inbuf with the keystream produced from
 * state, writing each accepted plaintext byte to output_data_str.
 *
 * The first 7 plaintext bytes must spell "http://"; at the first byte that
 * does not, the function returns without writing that byte. Bytes past the
 * prefix are written unconditionally.
 *
 * state           - cipher state from init_arc4(); modified in place.
 * inbuf           - ciphertext (global memory).
 * buflen          - number of ciphertext bytes to process.
 * output_data_str - destination for plaintext bytes (global memory).
 */
void arc4(arc4state *state, const global unsigned char *inbuf, int buflen, global unsigned char *output_data_str)
{
  const char match[7] = "http://";
  unsigned char i = 0;
  unsigned char j = 0;

  /* pos is both the ciphertext read index and the plaintext write index. */
  for (int pos = 0; pos < buflen; pos++)
  {
    /* Advance the keystream: bump i, fold sbox[i] + 1 into j, swap the two. */
    i++;
    const unsigned char at_i = state->sbox[i];
    j += (at_i + 1);
    const unsigned char at_j = state->sbox[j];
    state->sbox[i] = at_j;
    state->sbox[j] = at_i;

    /* Keystream byte is looked up at the (8-bit wrapped) sum of the pair. */
    const unsigned char pick = (at_i + at_j);
    const unsigned char plain = inbuf[pos] ^ state->sbox[pick];

    /* Inside the 7-byte prefix a mismatch rejects this key immediately. */
    if (pos <= 6 && plain != match[pos]) {
      return;
    }
    output_data_str[pos] = plain;
  }
}
/*
 * Kernel entry point: each work-item tests one candidate key.
 *
 * The candidate number is get_global_id(0) + offset. It is expanded as a
 * base-36 number, least significant digit first, and each digit selects the
 * character stored at the next position listed in key_changable_positions.
 * The resulting key is run through init_arc4() and arc4() against 41 bytes
 * of input_string.
 *
 * keylength_under_testing           - key length being tested.
 * input_string                      - ciphertext (global memory).
 * output_data_str                   - receives plaintext bytes from arc4().
 * key_changable_positions           - key indices that get a generated byte.
 * current_brute_force_items_in_key  - number of entries used from that list.
 * offset                            - candidate number of work-item 0.
 */
__kernel void arc4_search(short keylength_under_testing,
                          const global unsigned char *input_string,
                          global unsigned char *output_data_str,
                          const global int *key_changable_positions,
                          int current_brute_force_items_in_key,
                          ulong offset) {
  /* Indices are taken modulo 36, so only the first 36 characters are used. */
  char alphabet[37] = "abcdefghijklmnopqrstuvwxyz0123456789.";
  char candidate_key[256];
  arc4state cipher_state;

  ulong remaining = get_global_id(0) + offset;
  int digit = 0;
  while (digit < current_brute_force_items_in_key) {
    candidate_key[key_changable_positions[digit]] = alphabet[remaining % 36];
    remaining /= 36;
    digit++;
  }
  candidate_key[keylength_under_testing] = 0;

  init_arc4(&cipher_state, candidate_key, keylength_under_testing);
  arc4(&cipher_state, input_string, 41, output_data_str);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment