Skip to content

Instantly share code, notes, and snippets.

@K0IN
Last active June 19, 2024 19:08
Show Gist options
  • Save K0IN/3fbafacffa5cbe135938d1d9dcf84084 to your computer and use it in GitHub Desktop.
Save K0IN/3fbafacffa5cbe135938d1d9dcf84084 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
// SHA-256 round constants.
// Entry i is added into T1 in round i of the compression loop in
// sha256_transform (the `k[i]` term), so the table must hold exactly 64 words.
// Declared __constant__: the table lives in device constant memory and is
// read-only from device code.
__constant__ uint32_t k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
// SHA-256 functions and transformations
// Rotates the 32-bit word `x` right by `n` bit positions.
//
// The rotation count is reduced modulo 32, so n == 0 and n >= 32 are well
// defined (a plain `x << (32 - n)` would shift by the full word width when
// n == 0, which is undefined behaviour in C++).
__device__ inline uint32_t rotr(uint32_t x, uint32_t n)
{
    n &= 31u;
    // (32 - n) & 31 maps n == 0 to a left shift of 0 instead of 32.
    return (x >> n) | (x << ((32u - n) & 31u));
}
// Bitwise "choose": for every bit position, takes the bit from `y` where the
// corresponding bit of `x` is 1 and the bit from `z` where it is 0.
__device__ inline uint32_t ch(uint32_t x, uint32_t y, uint32_t z)
{
    // Start from z and flip exactly those selected bits in which y differs from z.
    const uint32_t differing = y ^ z;
    return z ^ (x & differing);
}
// Bitwise majority: each result bit is 1 when at least two of the
// corresponding bits of `x`, `y` and `z` are 1.
__device__ inline uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
    const uint32_t both_xy = x & y;
    const uint32_t z_with_either = z & (x | y);
    return both_xy | z_with_either;
}
// XOR of `x` rotated right by 2, 13 and 22 bits.
// Applied to working variable `a` when computing T2 in the compression loop.
__device__ inline uint32_t sigma0(uint32_t x)
{
    uint32_t mixed = rotr(x, 2);
    mixed ^= rotr(x, 13);
    mixed ^= rotr(x, 22);
    return mixed;
}
// XOR of `x` rotated right by 6, 11 and 25 bits.
// Applied to working variable `e` when computing T1 in the compression loop.
__device__ inline uint32_t sigma1(uint32_t x)
{
    uint32_t mixed = rotr(x, 6);
    mixed ^= rotr(x, 11);
    mixed ^= rotr(x, 25);
    return mixed;
}
// XOR of `x` rotated right by 7 and 18 bits and `x` shifted right by 3 bits.
// Used on w[i - 15] while expanding the message schedule.
__device__ inline uint32_t gamma0(uint32_t x)
{
    const uint32_t rotated = rotr(x, 7) ^ rotr(x, 18);
    const uint32_t shifted = x >> 3;
    return rotated ^ shifted;
}
// XOR of `x` rotated right by 17 and 19 bits and `x` shifted right by 10 bits.
// Used on w[i - 2] while expanding the message schedule.
__device__ inline uint32_t gamma1(uint32_t x)
{
    const uint32_t rotated = rotr(x, 17) ^ rotr(x, 19);
    const uint32_t shifted = x >> 10;
    return rotated ^ shifted;
}
// Counts the leading zero bits of a 256-bit value stored as eight 32-bit
// words, most significant word first (val[0] holds the top 32 bits).
//
// Returns the count, saturated at 255: an all-zero value has 256 leading zero
// bits, which does not fit in the uint8_t return type and would otherwise wrap
// around to 0 and be reported as the worst possible result.
__device__ inline uint8_t count_leading_zeros(const uint32_t val[8])
{
    // Accumulate in 32 bits so the running total cannot wrap before saturation.
    uint32_t leading_zero_bits = 0;
#pragma unroll 8
    for (int j = 0; j < 8; ++j)
    {
        if (val[j] != 0)
        {
            // First non-zero word: add its own leading zeros and stop.
            // __clz is only called with a non-zero argument here.
            leading_zero_bits += static_cast<uint32_t>(__clz(static_cast<int>(val[j])));
            break;
        }
        leading_zero_bits += 32;
    }
    return static_cast<uint8_t>(leading_zero_bits > 255u ? 255u : leading_zero_bits);
}
// Hashes one 64-byte block per thread with a single SHA-256 compression round
// and stores the number of leading zero bits of the resulting digest.
//
// Launch layout: 1-D grid of 1-D blocks; thread `idx` processes message `idx`.
// Threads with idx >= num_chunks return immediately, so the grid may be
// rounded up past num_chunks.
//
// out        device buffer of at least num_chunks bytes; out[idx] receives the
//            leading-zero-bit count for message idx (see count_leading_zeros).
// messages   device buffer of num_chunks * 64 bytes. Exactly one compression
//            round is run per slot, so every 64-byte slot must already contain
//            the complete padded block (message, 0x80 marker, bit length).
// num_chunks number of 64-byte slots in `messages`.
__global__ void sha256_transform(uint8_t *out, const uint8_t *messages, const uint32_t num_chunks)
{
    // 64-bit unsigned index: avoids the signed/unsigned comparison against
    // num_chunks and keeps idx * msg_size from overflowing 32 bits.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= num_chunks)
        return;

    constexpr size_t msg_size = 64;
    const uint8_t *msg = messages + idx * msg_size;

    // Initial hash values; reused below when the compressed state is added back.
    constexpr uint32_t h0 = 0x6a09e667;
    constexpr uint32_t h1 = 0xbb67ae85;
    constexpr uint32_t h2 = 0x3c6ef372;
    constexpr uint32_t h3 = 0xa54ff53a;
    constexpr uint32_t h4 = 0x510e527f;
    constexpr uint32_t h5 = 0x9b05688c;
    constexpr uint32_t h6 = 0x1f83d9ab;
    constexpr uint32_t h7 = 0x5be0cd19;

    // Prepare message schedule: the first 16 words are the block read big-endian.
    uint32_t w[64];
#pragma unroll
    for (int i = 0; i < 16; i++)
    {
        const uint8_t *word = msg + i * 4;
        // Widen each byte to uint32_t before shifting so a set top bit is never
        // shifted into the sign bit of a promoted int.
        w[i] = (static_cast<uint32_t>(word[0]) << 24) |
               (static_cast<uint32_t>(word[1]) << 16) |
               (static_cast<uint32_t>(word[2]) << 8) |
               static_cast<uint32_t>(word[3]);
    }
#pragma unroll
    for (int i = 16; i < 64; i++)
    {
        w[i] = gamma1(w[i - 2]) + w[i - 7] + gamma0(w[i - 15]) + w[i - 16];
    }

    uint32_t a = h0, b = h1, c = h2, d = h3;
    uint32_t e = h4, f = h5, g = h6, h = h7;

    // Compression function main loop
#pragma unroll
    for (int i = 0; i < 64; i++)
    {
        const uint32_t T1 = h + sigma1(e) + ch(e, f, g) + k[i] + w[i];
        const uint32_t T2 = sigma0(a) + maj(a, b, c);
        h = g;
        g = f;
        f = e;
        e = d + T1;
        d = c;
        c = b;
        b = a;
        a = T1 + T2;
    }

    // Add the compressed chunk to the initial hash value to obtain the digest.
    const uint32_t hashes[8] = {
        a + h0,
        b + h1,
        c + h2,
        d + h3,
        e + h4,
        f + h5,
        g + h6,
        h + h7};
    out[idx] = count_leading_zeros(hashes);
}
// Number of candidate strings produced so far by generate_string.
static long long count = 0;

// Builds one candidate string of the form
//   <username>/never+gonna+give+you+up/<nonce>
// where <nonce> is `size` characters drawn uniformly from a 64-character
// alphabet (a-z, A-Z, 0-9, '+', '/').
//
// username  prefix placed in front of the fixed phrase.
// size      number of random nonce characters to append; 0 yields an empty nonce.
//
// Side effects: increments the file-level `count` once per call and advances a
// function-local random engine that is seeded from the wall clock on first use.
std::string generate_string(const std::string &username, const uint32_t size)
{
    count += 1;
    // Built once instead of on every call; the contents never change.
    static const std::string base64_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/";
    static std::default_random_engine generator(std::time(nullptr));
    static std::uniform_int_distribution<int> distribution(0, static_cast<int>(base64_chars.size()) - 1);

    std::string nonce;
    nonce.reserve(size);
    // Unsigned index matches the type of `size` (no signed/unsigned comparison).
    for (uint32_t i = 0; i < size; ++i)
    {
        nonce += base64_chars[distribution(generator)];
    }
    return username + "/never+gonna+give+you+up/" + nonce;
}
// Terminates the program with a readable message when a CUDA runtime call
// (or a kernel launch / execution reported through one) did not succeed.
static void check_cuda(const cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Endless search loop: generates batches of random candidate strings, hashes
// them on the GPU with sha256_transform and keeps the candidate whose digest
// has the most leading zero bits. After every `num_of_itters` batches it prints
// the hash rate, the best result so far and the running candidate counter.
//
// The loop has no exit condition; the process runs until it is killed or a
// CUDA call fails.
int main()
{
    const std::string username = "k0in";
    const uint32_t nonce_size = 8;
    const auto num_of_itters = 100;
    const uint32_t total_count_per_batch = 1024;
    const uint32_t chunk_size = 64;
    const uint32_t size_per_batch = total_count_per_batch * chunk_size;
    // A slot must hold the message, the 0x80 marker and the 8-byte length field.
    const uint32_t max_input_size = chunk_size - 1 - 8;

    // Host buffers (zero-initialised; owned by the vectors, so nothing to free).
    std::vector<uint8_t> host_messages(size_per_batch, 0);
    std::vector<uint8_t> host_leading_zeros(total_count_per_batch, 0);

    // Allocate device memory
    uint8_t *device_messages = nullptr;
    uint8_t *device_leading_zeros = nullptr;
    check_cuda(cudaMalloc(&device_messages, size_per_batch * sizeof(device_messages[0])),
               "cudaMalloc(device_messages)");
    check_cuda(cudaMalloc(&device_leading_zeros, total_count_per_batch * sizeof(device_leading_zeros[0])),
               "cudaMalloc(device_leading_zeros)");

    std::vector<std::string> inputs(total_count_per_batch);
    uint32_t max_leading_zeros = 0;
    std::string best_input;

    // Write the padding of every slot once. The message bytes are overwritten
    // for each batch below, while the 0x80 marker and the length field stay in
    // place; this relies on every generated string having the same length.
    for (uint32_t j = 0; j < total_count_per_batch; j++)
    {
        const auto input = generate_string(username, nonce_size);
        if (input.size() > max_input_size)
        {
            std::cerr << "Input of " << input.size() << " bytes does not fit into a single "
                      << chunk_size << "-byte SHA-256 block" << std::endl;
            cudaFree(device_messages);
            cudaFree(device_leading_zeros);
            return EXIT_FAILURE;
        }
        uint8_t *block = host_messages.data() + j * chunk_size;
        // End-of-message marker directly after the input bytes
        block[input.size()] = 0x80;
        // Message length in bits, big-endian, in the last 8 bytes of the block
        const uint64_t bit_length = static_cast<uint64_t>(input.size()) * 8;
        for (uint32_t b = 0; b < 8; ++b)
        {
            block[chunk_size - 1 - b] = static_cast<uint8_t>(bit_length >> (8 * b));
        }
    }

    // Launch configuration
    const uint32_t threadsPerBlock = 256; // Start with a value like 256 or 512
    const uint32_t blocksPerGrid = (total_count_per_batch + threadsPerBlock - 1) / threadsPerBlock;

    while (true)
    {
        const auto start_nonce = count;
        const auto start_time = std::chrono::high_resolution_clock::now();
        for (auto iter = 0; iter < num_of_itters; ++iter)
        {
            for (uint32_t j = 0; j < total_count_per_batch; j++)
            {
                inputs[j] = generate_string(username, nonce_size);
                // init the blocks with the input
                std::copy(inputs[j].begin(), inputs[j].end(), host_messages.data() + j * chunk_size);
            }
            // Copy messages to device
            check_cuda(cudaMemcpy(device_messages, host_messages.data(),
                                  size_per_batch * sizeof(host_messages[0]), cudaMemcpyHostToDevice),
                       "cudaMemcpy(host -> device messages)");
            sha256_transform<<<blocksPerGrid, threadsPerBlock>>>(device_leading_zeros, device_messages, total_count_per_batch);
            // Launch-configuration errors are only visible through cudaGetLastError
            check_cuda(cudaGetLastError(), "sha256_transform launch");
            // Errors raised while the kernel runs surface at the next synchronisation
            check_cuda(cudaDeviceSynchronize(), "sha256_transform execution");
            // Copy leading-zero counts back to host
            check_cuda(cudaMemcpy(host_leading_zeros.data(), device_leading_zeros,
                                  total_count_per_batch * sizeof(device_leading_zeros[0]), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device -> host leading zeros)");
            for (uint32_t slot = 0; slot < total_count_per_batch; slot++)
            {
                const uint32_t leading_zeros = host_leading_zeros[slot];
                if (leading_zeros > max_leading_zeros)
                {
                    max_leading_zeros = leading_zeros;
                    best_input = inputs[slot];
                }
            }
        }
        const auto stop_time = std::chrono::high_resolution_clock::now();
        const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time);
        // Candidates per millisecond == KHash/s. Divide in floating point so the
        // rate is not truncated to a whole number before being stored.
        const double elapsed_ms = static_cast<double>(duration.count() != 0 ? duration.count() : 1);
        const double hashrate = static_cast<double>(count - start_nonce) / elapsed_ms;
        std::cout << "Hashrate: " << hashrate << " KHash/s " << duration.count() << " ms/iter/" << std::endl;
        std::cout << "Leading zeros: " << max_leading_zeros << std::endl;
        std::cout << "Best input: " << best_input << std::endl;
        std::cout << "Nonce: " << count << " +" << count - start_nonce << std::endl;
    }

    // Only reached if the loop above is ever given an exit condition.
    cudaFree(device_messages);
    cudaFree(device_leading_zeros);
    return 0;
}
@K0IN
Copy link
Author

K0IN commented Jun 18, 2024

challenge: https://shallenge.quirino.net/
Objective: Get the lowest possible SHA256 hash.
Submit a string in the format "{username}/{nonce}", where:
username: 1-32 characters from a-zA-Z0-9_-
nonce: 1-64 characters from Base64 (a-zA-Z0-9+/)
The hash of the full string will be considered, not just the nonce

@K0IN
Copy link
Author

K0IN commented Jun 19, 2024

This hits 4 MH/s on my RTX 2070 Super

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment