Last active
June 19, 2024 19:08
-
-
Save K0IN/3fbafacffa5cbe135938d1d9dcf84084 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <chrono>
#include <csignal>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
// SHA-256 round constants, kept in CUDA constant memory.
// The compression loop of the kernel reads k[i] once per round i (0..63).
__constant__ uint32_t k[64] = {
    // rounds 0-15
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    // rounds 16-31
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    // rounds 32-47
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    // rounds 48-63
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
// SHA-256 functions and transformations

// Rotates the 32-bit word x to the right by n bit positions.
//
// Both shift counts are reduced modulo 32, so n == 0 (or any multiple of 32)
// returns x unchanged instead of evaluating the undefined expression `x << 32`.
__device__ inline uint32_t rotr(uint32_t x, uint32_t n)
{
    return (x >> (n & 31u)) | (x << ((32u - n) & 31u));
}
// SHA-256 "choose": for every bit position, picks the bit of y where the
// corresponding bit of x is 1 and the bit of z where it is 0.
__device__ inline uint32_t ch(uint32_t x, uint32_t y, uint32_t z)
{
    // Multiplexer form: start from z and flip towards y wherever x is set.
    const uint32_t differs = y ^ z;
    return z ^ (x & differs);
}
// SHA-256 "majority": each result bit is the value held by at least two of
// the three corresponding input bits.
__device__ inline uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
    // A bit wins if x and y agree on 1, or if z is 1 together with x or y.
    const uint32_t both_xy = x & y;
    const uint32_t either_xy = x | y;
    return both_xy | (z & either_xy);
}
// Mixes x by XOR-ing its right rotations by 2, 13 and 22 bits.
// The compression rounds apply this to working variable `a`.
__device__ inline uint32_t sigma0(uint32_t x)
{
    const uint32_t by2 = rotr(x, 2);
    const uint32_t by13 = rotr(x, 13);
    const uint32_t by22 = rotr(x, 22);
    return by2 ^ by13 ^ by22;
}
// Mixes x by XOR-ing its right rotations by 6, 11 and 25 bits.
// The compression rounds apply this to working variable `e`.
__device__ inline uint32_t sigma1(uint32_t x)
{
    const uint32_t by6 = rotr(x, 6);
    const uint32_t by11 = rotr(x, 11);
    const uint32_t by25 = rotr(x, 25);
    return by6 ^ by11 ^ by25;
}
// Message-schedule mixer: XOR of x rotated right by 7, rotated right by 18,
// and logically shifted right by 3. Applied to w[i - 15] when extending the
// schedule.
__device__ inline uint32_t gamma0(uint32_t x)
{
    const uint32_t rotated = rotr(x, 7) ^ rotr(x, 18);
    const uint32_t shifted = x >> 3;
    return rotated ^ shifted;
}
// Message-schedule mixer: XOR of x rotated right by 17, rotated right by 19,
// and logically shifted right by 10. Applied to w[i - 2] when extending the
// schedule.
__device__ inline uint32_t gamma1(uint32_t x)
{
    const uint32_t rotated = rotr(x, 17) ^ rotr(x, 19);
    const uint32_t shifted = x >> 10;
    return rotated ^ shifted;
}
// Counts the leading zero bits of a 256-bit value stored as eight 32-bit
// words, with val[0] holding the most significant bits.
//
// Returns the number of zero bits before the first set bit, saturated at 255.
// An all-zero value has 256 leading zeros, which does not fit the uint8_t
// return type; without saturation it would wrap around to 0 and make the best
// possible digest look like the worst one.
__device__ inline uint8_t count_leading_zeros(const uint32_t val[8])
{
    // Accumulate in 32 bits so that eight empty words (8 * 32) cannot overflow.
    uint32_t zero_bits = 0;
#pragma unroll
    for (int word = 0; word < 8; ++word)
    {
        if (val[word] != 0)
        {
            // First non-zero word: __clz yields its leading-zero count
            // directly, after which the scan is finished.
            zero_bits += static_cast<uint32_t>(__clz(static_cast<int>(val[word])));
            break;
        }
        zero_bits += 32;
    }
    return static_cast<uint8_t>(zero_bits > 255u ? 255u : zero_bits);
}
// Hashes one 64-byte block per thread with the SHA-256 compression function
// and stores how many leading zero bits the resulting digest has.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per message. Threads whose
// global index is >= num_chunks return immediately, so the grid may be rounded
// up to a whole number of blocks. No shared memory is used.
//
// Parameters:
//   out        - num_chunks bytes; out[i] receives count_leading_zeros() of
//                the digest of message i.
//   messages   - num_chunks * 64 bytes; message i occupies bytes
//                [64 * i, 64 * i + 64). Exactly one compression round is run
//                per message, so each block must already contain the SHA-256
//                padding and length field, and the message must fit in a
//                single block.
//   num_chunks - number of messages to hash.
__global__ void sha256_transform(uint8_t *out, const uint8_t *messages, const uint32_t num_chunks)
{
    // Index math is done in size_t so neither the global thread index nor the
    // byte offset (idx * 64) can wrap in 32 bits, and the bounds check compares
    // unsigned with unsigned.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= num_chunks)
        return;

    constexpr size_t msg_size = 64;
    const uint8_t *msg = messages + idx * msg_size;

    // Initial hash values; reused at the end when the compressed chunk is
    // added back onto them.
    constexpr uint32_t H0 = 0x6a09e667;
    constexpr uint32_t H1 = 0xbb67ae85;
    constexpr uint32_t H2 = 0x3c6ef372;
    constexpr uint32_t H3 = 0xa54ff53a;
    constexpr uint32_t H4 = 0x510e527f;
    constexpr uint32_t H5 = 0x9b05688c;
    constexpr uint32_t H6 = 0x1f83d9ab;
    constexpr uint32_t H7 = 0x5be0cd19;

    uint32_t a = H0, b = H1, c = H2, d = H3, e = H4, f = H5, g = H6, h = H7;

    // Prepare message schedule: the first 16 words are the block read as
    // big-endian 32-bit integers. Each byte is widened to uint32_t before
    // shifting; shifting the promoted (signed) int by 24 would overflow into
    // the sign bit for bytes >= 0x80.
    uint32_t w[64];
#pragma unroll
    for (int i = 0; i < 16; i++)
    {
        w[i] = (static_cast<uint32_t>(msg[i * 4]) << 24) |
               (static_cast<uint32_t>(msg[i * 4 + 1]) << 16) |
               (static_cast<uint32_t>(msg[i * 4 + 2]) << 8) |
               static_cast<uint32_t>(msg[i * 4 + 3]);
    }
    // The remaining 48 words are derived from earlier schedule entries.
#pragma unroll
    for (int i = 16; i < 64; i++)
    {
        w[i] = gamma1(w[i - 2]) + w[i - 7] + gamma0(w[i - 15]) + w[i - 16];
    }

    // Compression function main loop
#pragma unroll
    for (int i = 0; i < 64; i++)
    {
        const uint32_t T1 = h + sigma1(e) + ch(e, f, g) + k[i] + w[i];
        const uint32_t T2 = sigma0(a) + maj(a, b, c);
        h = g;
        g = f;
        f = e;
        e = d + T1;
        d = c;
        c = b;
        b = a;
        a = T1 + T2;
    }

    // Add the compressed chunk to the initial hash value to get the digest,
    // most significant word first.
    const uint32_t hashes[8]{
        a + H0,
        b + H1,
        c + H2,
        d + H3,
        e + H4,
        f + H5,
        g + H6,
        h + H7};

    out[idx] = count_leading_zeros(hashes);
}
// Total number of candidate strings produced by generate_string() so far.
// main() samples it before and after a round to derive the hash rate.
static long long count = 0;

// Builds one candidate of the form "<username>/never+gonna+give+you+up/<nonce>".
//
// Parameters:
//   username - text placed in front of the fixed separator.
//   size     - number of random nonce characters appended after the separator.
//
// Returns the assembled candidate; its length is always
// username.size() + 25 + size.
//
// Side effects: increments the global `count` and advances a function-local
// random engine that is seeded from the wall clock on first use. Because of
// that shared static state the function is not safe to call concurrently.
std::string generate_string(const std::string &username, const uint32_t size)
{
    // Built once instead of on every call; this function runs in the hot host
    // loop that feeds the GPU.
    static const std::string base64_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/";
    static const std::string separator = "/never+gonna+give+you+up/";
    static std::default_random_engine generator(std::time(nullptr));
    static std::uniform_int_distribution<int> distribution(0, base64_chars.size() - 1);

    count += 1;

    // Assemble directly into one pre-sized buffer to avoid temporary strings.
    std::string candidate;
    candidate.reserve(username.size() + separator.size() + size);
    candidate += username;
    candidate += separator;
    for (uint32_t i = 0; i < size; ++i)
    {
        candidate += base64_chars[distribution(generator)];
    }
    return candidate;
}
// Set by the SIGINT handler; the search loop checks it between rounds so the
// program can release its resources instead of being killed mid-flight.
static volatile std::sig_atomic_t stop_requested = 0;

// Signal handler: only records the request, all real work happens in main().
static void request_stop(int)
{
    stop_requested = 1;
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed operation.
static void check_cuda(const cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Endlessly generates random candidate strings, hashes them on the GPU in
// batches and reports, after every round of batches, the hash rate and the
// candidate whose digest has the most leading zero bits seen so far.
// Runs until interrupted with Ctrl+C (SIGINT), then frees its buffers.
int main()
{
    const std::string username = "k0in";
    const uint32_t nonce_size = 8;
    const uint32_t num_of_itters = 100;           // batches per reported round
    const uint32_t total_count_per_batch = 1024;  // candidates per kernel launch
    const uint32_t chunk_size = 64;               // one SHA-256 block per candidate
    const uint32_t size_per_batch = total_count_per_batch * chunk_size;

    std::signal(SIGINT, request_stop);

    // Host buffers (zero-initialised; the zero bytes double as SHA-256 padding).
    std::vector<uint8_t> host_messages(size_per_batch, 0);
    std::vector<uint8_t> host_leading_zeros(total_count_per_batch, 0);

    // Every candidate has the same length (same username, fixed nonce size),
    // so the padding only has to be written once per slot. One sample string
    // is generated to learn that length.
    const size_t input_size = generate_string(username, nonce_size).size();
    // The kernel hashes exactly one block: message + 0x80 marker + 8 length
    // bytes must fit, otherwise the writes below would corrupt the length
    // field or the neighbouring slot.
    if (input_size + 1 + 8 > chunk_size)
    {
        std::cerr << "Input of " << input_size << " bytes does not fit into a single SHA-256 block" << std::endl;
        return EXIT_FAILURE;
    }
    const uint64_t bit_length = static_cast<uint64_t>(input_size) * 8;
    for (uint32_t j = 0; j < total_count_per_batch; ++j)
    {
        uint8_t *block = host_messages.data() + static_cast<size_t>(j) * chunk_size;
        // End-of-message marker directly after the input bytes.
        block[input_size] = 0x80;
        // Message length in bits as a big-endian 64-bit integer in the last
        // eight bytes of the block.
        for (uint32_t byte = 0; byte < 8; ++byte)
        {
            block[chunk_size - 1 - byte] = static_cast<uint8_t>(bit_length >> (8 * byte));
        }
    }

    // Allocate device memory
    uint8_t *device_messages = nullptr;
    uint8_t *device_leading_zeros = nullptr;
    check_cuda(cudaMalloc(&device_messages, size_per_batch * sizeof(device_messages[0])), "cudaMalloc(device_messages)");
    check_cuda(cudaMalloc(&device_leading_zeros, total_count_per_batch * sizeof(device_leading_zeros[0])), "cudaMalloc(device_leading_zeros)");

    std::vector<std::string> inputs(total_count_per_batch);
    uint32_t max_leading_zeros = 0;
    std::string best_input;

    // Launch configuration: one thread per candidate, grid rounded up.
    const uint32_t threadsPerBlock = 256;
    const uint32_t blocksPerGrid = (total_count_per_batch + threadsPerBlock - 1) / threadsPerBlock;

    while (!stop_requested)
    {
        const auto start_nonce = count;
        const auto start_time = std::chrono::high_resolution_clock::now();
        for (uint32_t iter = 0; iter < num_of_itters; ++iter)
        {
            for (uint32_t j = 0; j < total_count_per_batch; ++j)
            {
                inputs[j] = generate_string(username, nonce_size);
                // Overwrite only the message bytes; the padding behind them
                // stays valid because the length never changes.
                std::copy(inputs[j].begin(), inputs[j].end(), host_messages.data() + static_cast<size_t>(j) * chunk_size);
            }

            // Copy messages to device
            check_cuda(cudaMemcpy(device_messages, host_messages.data(), size_per_batch * sizeof(host_messages[0]), cudaMemcpyHostToDevice), "cudaMemcpy(host -> device)");

            sha256_transform<<<blocksPerGrid, threadsPerBlock>>>(device_leading_zeros, device_messages, total_count_per_batch);
            // Launch-configuration errors are only visible through the last-error state.
            check_cuda(cudaGetLastError(), "sha256_transform launch");

            // Copy results back to host. This blocking copy runs on the same
            // (default) stream as the kernel, so it waits for the kernel and
            // also reports any error raised while the kernel executed.
            check_cuda(cudaMemcpy(host_leading_zeros.data(), device_leading_zeros, total_count_per_batch * sizeof(device_leading_zeros[0]), cudaMemcpyDeviceToHost), "cudaMemcpy(device -> host)");

            for (uint32_t slot = 0; slot < total_count_per_batch; ++slot)
            {
                const uint32_t leading_zeros = host_leading_zeros[slot];
                if (leading_zeros > max_leading_zeros)
                {
                    max_leading_zeros = leading_zeros;
                    best_input = inputs[slot];
                }
            }
        }

        const auto stop_time = std::chrono::high_resolution_clock::now();
        const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time);
        // Hashes per millisecond == kilo-hashes per second. Computed in
        // floating point so the quotient is not truncated; a zero duration is
        // treated as 1 ms to avoid dividing by zero.
        const double elapsed_ms = static_cast<double>(duration.count() != 0 ? duration.count() : 1);
        const double hashrate = static_cast<double>(count - start_nonce) / elapsed_ms;
        std::cout << "Hashrate: " << hashrate << " KHash/s " << duration.count() << " ms/iter/" << std::endl;
        std::cout << "Leading zeros: " << max_leading_zeros << std::endl;
        std::cout << "Best input: " << best_input << std::endl;
        std::cout << "Nonce: " << count << " +" << count - start_nonce << std::endl;
    }

    // Free device memory (host buffers are released by their vectors).
    check_cuda(cudaFree(device_messages), "cudaFree(device_messages)");
    check_cuda(cudaFree(device_leading_zeros), "cudaFree(device_leading_zeros)");
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This hits 4 MH/s on my RTX 2070 Super