K0IN/SHA256.cu

## SHA256.cu
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>

// SHA-256 round constants K[0..63] (as listed in FIPS 180-4). Placed in
// __constant__ memory; the compression loop of sha256_transform reads k[i]
// with the same index i in every thread.
__constant__ uint32_t k[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};

// SHA-256 functions and transformations
// Rotate the 32-bit word `x` right by `n` bit positions.
//
// Both shift counts are reduced modulo 32, so n == 0 and n >= 32 are well
// defined. Shifting a 32-bit value by 32 bits (what `x << (32 - n)` does for
// n == 0) is undefined behaviour in C++.
__device__ inline uint32_t rotr(uint32_t x, uint32_t n)
{
  const uint32_t r = n & 31u;
  return (x >> r) | (x << ((32u - r) & 31u));
}

// SHA-256 "choose": for every bit position, take the bit of `y` where `x` has
// a 1 and the bit of `z` where `x` has a 0.
__device__ inline uint32_t ch(uint32_t x, uint32_t y, uint32_t z)
{
  // Start from z and flip exactly those x-selected bits in which y differs.
  const uint32_t differing = y ^ z;
  return z ^ (x & differing);
}

// SHA-256 "majority": each result bit is 1 when at least two of the
// corresponding bits of `x`, `y` and `z` are 1.
__device__ inline uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
  const uint32_t both = x & y;   // x and y already form a majority
  const uint32_t either = x | y; // z needs one of them to agree
  return both | (z & either);
}

// XOR of `x` rotated right by 2, 13 and 22 bits. The compression loop applies
// it to working variable `a` when forming T2.
__device__ inline uint32_t sigma0(uint32_t x)
{
  const uint32_t r2 = rotr(x, 2);
  const uint32_t r13 = rotr(x, 13);
  const uint32_t r22 = rotr(x, 22);
  return r2 ^ r13 ^ r22;
}

// XOR of `x` rotated right by 6, 11 and 25 bits. The compression loop applies
// it to working variable `e` when forming T1.
__device__ inline uint32_t sigma1(uint32_t x)
{
  const uint32_t r6 = rotr(x, 6);
  const uint32_t r11 = rotr(x, 11);
  const uint32_t r25 = rotr(x, 25);
  return r6 ^ r11 ^ r25;
}

// XOR of `x` rotated right by 7 and 18 bits with `x` shifted right by 3 bits.
// The message schedule applies it to w[i - 15].
__device__ inline uint32_t gamma0(uint32_t x)
{
  const uint32_t r7 = rotr(x, 7);
  const uint32_t r18 = rotr(x, 18);
  const uint32_t s3 = x >> 3;
  return r7 ^ r18 ^ s3;
}

// XOR of `x` rotated right by 17 and 19 bits with `x` shifted right by 10
// bits. The message schedule applies it to w[i - 2].
__device__ inline uint32_t gamma1(uint32_t x)
{
  const uint32_t r17 = rotr(x, 17);
  const uint32_t r19 = rotr(x, 19);
  const uint32_t s10 = x >> 10;
  return r17 ^ r19 ^ s10;
}

// Count the leading zero bits of a 256-bit value given as eight 32-bit words,
// most significant word first (val[0] holds the top 32 bits).
//
// Returns the count saturated to 255. An all-zero input has 256 leading zero
// bits, which does not fit the uint8_t return type; accumulating directly in
// a uint8_t would wrap that case around to 0.
__device__ inline uint8_t count_leading_zeros(const uint32_t val[8])
{
  uint32_t leading_zero_bits = 0;
#pragma unroll
  for (int word = 0; word < 8; ++word)
  {
    if (val[word] != 0)
    {
      // First non-zero word: __clz gives the number of consecutive zero bits
      // starting from bit 31, which completes the count.
      return static_cast<uint8_t>(leading_zero_bits + __clz(static_cast<int>(val[word])));
    }
    leading_zero_bits += 32;
  }
  return 255;
}

// Hash one 64-byte block per thread and store how many leading zero bits the
// resulting digest has.
//
// Launch layout: 1-D grid of 1-D blocks. The thread with flat index `idx`
// handles message `idx`; threads with idx >= num_chunks return immediately,
// so the grid may be rounded up to a whole number of blocks.
//
// Parameters:
//   out        - at least num_chunks bytes; out[idx] receives the
//                leading-zero count of message idx's digest.
//   messages   - num_chunks consecutive 64-byte blocks. Each block goes
//                through exactly one compression from the initial hash state,
//                so the caller must already have applied the SHA-256 padding
//                and the padded message must fit in a single block.
//   num_chunks - number of 64-byte blocks in `messages`.
__global__ void sha256_transform(uint8_t *out, const uint8_t *messages, const uint32_t num_chunks)
{
  // Unsigned 64-bit index: avoids the signed/unsigned comparison against
  // num_chunks and keeps idx * msg_size from wrapping at 32 bits.
  const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx >= num_chunks)
    return;

  const size_t msg_size = 64;
  const uint8_t *msg = messages + idx * msg_size;

  // Initial hash state, also added back after the compression rounds.
  const uint32_t iv[8] = {
      0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};

  uint32_t a = iv[0];
  uint32_t b = iv[1];
  uint32_t c = iv[2];
  uint32_t d = iv[3];
  uint32_t e = iv[4];
  uint32_t f = iv[5];
  uint32_t g = iv[6];
  uint32_t h = iv[7];

  // Message schedule: the first 16 words are the block read as big-endian
  // 32-bit integers; the remaining 48 are derived from earlier words.
  uint32_t w[64];
#pragma unroll
  for (int i = 0; i < 16; ++i)
  {
    const uint8_t *p = msg + i * 4;
    w[i] = (static_cast<uint32_t>(p[0]) << 24) |
           (static_cast<uint32_t>(p[1]) << 16) |
           (static_cast<uint32_t>(p[2]) << 8) |
           static_cast<uint32_t>(p[3]);
  }
#pragma unroll
  for (int i = 16; i < 64; ++i)
  {
    w[i] = gamma1(w[i - 2]) + w[i - 7] + gamma0(w[i - 15]) + w[i - 16];
  }

  // Compression function main loop
#pragma unroll
  for (int i = 0; i < 64; ++i)
  {
    const uint32_t T1 = h + sigma1(e) + ch(e, f, g) + k[i] + w[i];
    const uint32_t T2 = sigma0(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;
  }

  // Add the compressed chunk to the initial hash value to obtain the digest.
  const uint32_t hashes[8] = {
      a + iv[0], b + iv[1], c + iv[2], d + iv[3],
      e + iv[4], f + iv[5], g + iv[6], h + iv[7]};

  out[idx] = count_leading_zeros(hashes);
}

// Number of candidate strings generated so far; main() reads it to report
// progress and the hash rate.
static long long count = 0;

// Build one candidate input: `username`, the fixed separator
// "/never+gonna+give+you+up/", then `size` characters drawn uniformly at
// random from a 64-character alphabet. Every call increments the global
// `count`.
//
// The alphabet, random engine and distribution are function-static: they are
// created on the first call (the engine is seeded from the wall clock) and
// reused afterwards, so the alphabet is no longer rebuilt on every call. The
// function mutates this shared state without any synchronisation.
std::string generate_string(const std::string &username, const uint32_t size)
{
  static const std::string base64_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/";
  static std::default_random_engine generator(std::time(nullptr));
  static std::uniform_int_distribution<int> distribution(0, static_cast<int>(base64_chars.size()) - 1);

  count += 1;

  std::string result = username + "/never+gonna+give+you+up/";
  result.reserve(result.size() + size);
  for (uint32_t i = 0; i < size; ++i)
  {
    result += base64_chars[distribution(generator)];
  }
  return result;
}

// Print a diagnostic and terminate the process if a CUDA runtime call failed.
static void check_cuda(const cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Endless search for the candidate string whose digest has the most leading
// zero bits.
//
// Each round generates num_of_itters batches of total_count_per_batch random
// candidates, hashes every batch with sha256_transform, tracks the best
// candidate seen so far and then prints throughput and the current best. The
// loop never terminates on its own; the process ends on an external signal or
// on the first CUDA error.
int main()
{
  const std::string username = "k0in";
  const uint32_t nonce_size = 8;
  const uint32_t num_of_itters = 100;

  const uint32_t total_count_per_batch = 1024;
  const uint32_t chunk_size = 64;

  const uint32_t size_per_batch = total_count_per_batch * chunk_size;

  // Host buffers. host_messages starts zero-filled; only the candidate bytes
  // are rewritten per batch, so the padding written below stays in place.
  std::vector<uint8_t> host_messages(size_per_batch, 0);
  std::vector<uint8_t> host_leading_zeros(total_count_per_batch, 0);

  // Every candidate has the same length (username + separator + nonce_size
  // characters), so one sample is enough to lay out the padding.
  const size_t input_size = generate_string(username, nonce_size).size();
  if (input_size + 1 + 8 > chunk_size)
  {
    // The kernel hashes exactly one block: the message, the 0x80 marker and
    // the 8-byte length field must all fit in chunk_size bytes.
    std::cerr << "Input of " << input_size << " bytes does not fit in a single "
              << chunk_size << "-byte block" << std::endl;
    return EXIT_FAILURE;
  }

  const uint64_t bit_length = static_cast<uint64_t>(input_size) * 8;
  for (uint32_t j = 0; j < total_count_per_batch; ++j)
  {
    uint8_t *block = host_messages.data() + static_cast<size_t>(j) * chunk_size;

    // Padding marker directly after the message bytes.
    block[input_size] = 0x80;

    // Message length in bits as a big-endian 64-bit integer in the last 8 bytes.
    for (uint32_t byte = 0; byte < 8; ++byte)
    {
      block[chunk_size - 1 - byte] = static_cast<uint8_t>(bit_length >> (8 * byte));
    }
  }

  // Allocate device memory
  uint8_t *device_messages = nullptr;
  uint8_t *device_leading_zeros = nullptr;
  check_cuda(cudaMalloc(&device_messages, size_per_batch * sizeof(device_messages[0])),
             "cudaMalloc device_messages");
  check_cuda(cudaMalloc(&device_leading_zeros, total_count_per_batch * sizeof(device_leading_zeros[0])),
             "cudaMalloc device_leading_zeros");

  std::vector<std::string> inputs(total_count_per_batch);

  uint32_t max_leading_zeros = 0;
  std::string best_input;

  // Launch configuration: one thread per message, grid rounded up.
  const uint32_t threadsPerBlock = 256; // Start with a value like 256 or 512
  const uint32_t blocksPerGrid = (total_count_per_batch + threadsPerBlock - 1) / threadsPerBlock;
  while (true)
  {
    const auto start_nonce = count;
    const auto start_time = std::chrono::high_resolution_clock::now();
    for (uint32_t iter = 0; iter < num_of_itters; ++iter)
    {
      for (uint32_t j = 0; j < total_count_per_batch; ++j)
      {
        inputs[j] = generate_string(username, nonce_size);
        // Overwrite the message bytes of block j; padding bytes are untouched.
        std::copy(inputs[j].begin(), inputs[j].end(),
                  host_messages.data() + static_cast<size_t>(j) * chunk_size);
      }

      // Copy messages to device
      check_cuda(cudaMemcpy(device_messages, host_messages.data(),
                            host_messages.size() * sizeof(host_messages[0]), cudaMemcpyHostToDevice),
                 "cudaMemcpy host to device");

      sha256_transform<<<blocksPerGrid, threadsPerBlock>>>(device_leading_zeros, device_messages, total_count_per_batch);
      // A kernel launch returns no status; configuration errors show up via
      // cudaGetLastError and execution errors at the next synchronising call.
      check_cuda(cudaGetLastError(), "sha256_transform launch");
      check_cuda(cudaDeviceSynchronize(), "sha256_transform execution");

      // Copy leading-zero counts back to host
      check_cuda(cudaMemcpy(host_leading_zeros.data(), device_leading_zeros,
                            host_leading_zeros.size() * sizeof(host_leading_zeros[0]), cudaMemcpyDeviceToHost),
                 "cudaMemcpy device to host");

      for (uint32_t m = 0; m < total_count_per_batch; ++m)
      {
        const uint32_t leading_zeros = host_leading_zeros[m];
        if (leading_zeros > max_leading_zeros)
        {
          max_leading_zeros = leading_zeros;
          best_input = inputs[m];
        }
      }
    }

    const auto stop_time = std::chrono::high_resolution_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time);
    // Hashes per millisecond == KHash/s. Divide as doubles so the fractional
    // part is kept, and guard against a zero-millisecond measurement.
    const auto elapsed_ms = duration.count() != 0 ? duration.count() : 1;
    const double hashrate = static_cast<double>(count - start_nonce) / static_cast<double>(elapsed_ms);

    std::cout << "Hashrate: " << hashrate << " KHash/s " << duration.count() << " ms/iter/" << std::endl;
    std::cout << "Leading zeros: " << max_leading_zeros << std::endl;
    std::cout << "Best input: " << best_input << std::endl;
    std::cout << "Nonce: " << count << " +" << count - start_nonce << std::endl;
  }

  // Not reached while the loop above is unconditional; kept so the device
  // buffers are released if an exit condition is ever added. The host buffers
  // are std::vector and release themselves.
  cudaFree(device_messages);
  cudaFree(device_leading_zeros);

  return 0;
}
	#include <iostream>
	#include <string>
	#include <random>
	#include <ctime>
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <chrono>

	// SHA-256 round constants K[0..63] (as listed in FIPS 180-4). Placed in
	// __constant__ memory; the compression loop of sha256_transform reads k[i]
	// with the same index i in every thread.
	__constant__ uint32_t k[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};

	// SHA-256 functions and transformations
	// Rotate the 32-bit word `x` right by `n` bit positions.
	//
	// Both shift counts are reduced modulo 32, so n == 0 and n >= 32 are well
	// defined. Shifting a 32-bit value by 32 bits (what `x << (32 - n)` does for
	// n == 0) is undefined behaviour in C++. The bitwise OR is written as a
	// plain `|`; an escaped `\|` is not valid C++.
	__device__ inline uint32_t rotr(uint32_t x, uint32_t n)
	{
	  const uint32_t r = n & 31u;
	  return (x >> r) | (x << ((32u - r) & 31u));
	}

	// SHA-256 "choose": for every bit position, take the bit of `y` where `x` has
	// a 1 and the bit of `z` where `x` has a 0.
	__device__ inline uint32_t ch(uint32_t x, uint32_t y, uint32_t z)
	{
	  // Start from z and flip exactly those x-selected bits in which y differs.
	  const uint32_t differing = y ^ z;
	  return z ^ (x & differing);
	}

	// SHA-256 "majority": each result bit is 1 when at least two of the
	// corresponding bits of `x`, `y` and `z` are 1.
	__device__ inline uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
	{
	  const uint32_t both = x & y;   // x and y already form a majority
	  const uint32_t either = x | y; // z needs one of them to agree
	  return both | (z & either);
	}

	// XOR of `x` rotated right by 2, 13 and 22 bits. The compression loop applies
	// it to working variable `a` when forming T2.
	__device__ inline uint32_t sigma0(uint32_t x)
	{
	  const uint32_t r2 = rotr(x, 2);
	  const uint32_t r13 = rotr(x, 13);
	  const uint32_t r22 = rotr(x, 22);
	  return r2 ^ r13 ^ r22;
	}

	// XOR of `x` rotated right by 6, 11 and 25 bits. The compression loop applies
	// it to working variable `e` when forming T1.
	__device__ inline uint32_t sigma1(uint32_t x)
	{
	  const uint32_t r6 = rotr(x, 6);
	  const uint32_t r11 = rotr(x, 11);
	  const uint32_t r25 = rotr(x, 25);
	  return r6 ^ r11 ^ r25;
	}

	// XOR of `x` rotated right by 7 and 18 bits with `x` shifted right by 3 bits.
	// The message schedule applies it to w[i - 15].
	__device__ inline uint32_t gamma0(uint32_t x)
	{
	  const uint32_t r7 = rotr(x, 7);
	  const uint32_t r18 = rotr(x, 18);
	  const uint32_t s3 = x >> 3;
	  return r7 ^ r18 ^ s3;
	}

	// XOR of `x` rotated right by 17 and 19 bits with `x` shifted right by 10
	// bits. The message schedule applies it to w[i - 2].
	__device__ inline uint32_t gamma1(uint32_t x)
	{
	  const uint32_t r17 = rotr(x, 17);
	  const uint32_t r19 = rotr(x, 19);
	  const uint32_t s10 = x >> 10;
	  return r17 ^ r19 ^ s10;
	}

	// Count the leading zero bits of a 256-bit value given as eight 32-bit words,
	// most significant word first (val[0] holds the top 32 bits).
	//
	// Returns the count saturated to 255. An all-zero input has 256 leading zero
	// bits, which does not fit the uint8_t return type; accumulating directly in
	// a uint8_t would wrap that case around to 0.
	__device__ inline uint8_t count_leading_zeros(const uint32_t val[8])
	{
	  uint32_t leading_zero_bits = 0;
	#pragma unroll
	  for (int word = 0; word < 8; ++word)
	  {
	    if (val[word] != 0)
	    {
	      // First non-zero word: __clz gives the number of consecutive zero bits
	      // starting from bit 31, which completes the count.
	      return static_cast<uint8_t>(leading_zero_bits + __clz(static_cast<int>(val[word])));
	    }
	    leading_zero_bits += 32;
	  }
	  return 255;
	}

	// Hash one 64-byte block per thread and store how many leading zero bits the
	// resulting digest has.
	//
	// Launch layout: 1-D grid of 1-D blocks. The thread with flat index `idx`
	// handles message `idx`; threads with idx >= num_chunks return immediately,
	// so the grid may be rounded up to a whole number of blocks.
	//
	// Parameters:
	//   out        - at least num_chunks bytes; out[idx] receives the
	//                leading-zero count of message idx's digest.
	//   messages   - num_chunks consecutive 64-byte blocks. Each block goes
	//                through exactly one compression from the initial hash state,
	//                so the caller must already have applied the SHA-256 padding
	//                and the padded message must fit in a single block.
	//   num_chunks - number of 64-byte blocks in `messages`.
	//
	// `out` and `messages` are pointers (they are indexed below and receive
	// device buffers at the launch site), and the message offset is the product
	// idx * msg_size.
	__global__ void sha256_transform(uint8_t *out, const uint8_t *messages, const uint32_t num_chunks)
	{
	  // Unsigned 64-bit index: avoids the signed/unsigned comparison against
	  // num_chunks and keeps idx * msg_size from wrapping at 32 bits.
	  const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	  if (idx >= num_chunks)
	    return;

	  const size_t msg_size = 64;
	  const uint8_t *msg = messages + idx * msg_size;

	  // Initial hash state, also added back after the compression rounds.
	  const uint32_t iv[8] = {
	      0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};

	  uint32_t a = iv[0];
	  uint32_t b = iv[1];
	  uint32_t c = iv[2];
	  uint32_t d = iv[3];
	  uint32_t e = iv[4];
	  uint32_t f = iv[5];
	  uint32_t g = iv[6];
	  uint32_t h = iv[7];

	  // Message schedule: the first 16 words are the block read as big-endian
	  // 32-bit integers; the remaining 48 are derived from earlier words.
	  uint32_t w[64];
	#pragma unroll
	  for (int i = 0; i < 16; ++i)
	  {
	    const uint8_t *p = msg + i * 4;
	    w[i] = (static_cast<uint32_t>(p[0]) << 24) |
	           (static_cast<uint32_t>(p[1]) << 16) |
	           (static_cast<uint32_t>(p[2]) << 8) |
	           static_cast<uint32_t>(p[3]);
	  }
	#pragma unroll
	  for (int i = 16; i < 64; ++i)
	  {
	    w[i] = gamma1(w[i - 2]) + w[i - 7] + gamma0(w[i - 15]) + w[i - 16];
	  }

	  // Compression function main loop
	#pragma unroll
	  for (int i = 0; i < 64; ++i)
	  {
	    const uint32_t T1 = h + sigma1(e) + ch(e, f, g) + k[i] + w[i];
	    const uint32_t T2 = sigma0(a) + maj(a, b, c);
	    h = g;
	    g = f;
	    f = e;
	    e = d + T1;
	    d = c;
	    c = b;
	    b = a;
	    a = T1 + T2;
	  }

	  // Add the compressed chunk to the initial hash value to obtain the digest.
	  const uint32_t hashes[8] = {
	      a + iv[0], b + iv[1], c + iv[2], d + iv[3],
	      e + iv[4], f + iv[5], g + iv[6], h + iv[7]};

	  out[idx] = count_leading_zeros(hashes);
	}

	// Number of candidate strings generated so far; main() reads it to report
	// progress and the hash rate.
	static long long count = 0;

	// Build one candidate input: `username`, the fixed separator
	// "/never+gonna+give+you+up/", then `size` characters drawn uniformly at
	// random from a 64-character alphabet. Every call increments the global
	// `count`.
	//
	// The alphabet, random engine and distribution are function-static: they are
	// created on the first call (the engine is seeded from the wall clock) and
	// reused afterwards, so the alphabet is no longer rebuilt on every call. The
	// function mutates this shared state without any synchronisation.
	std::string generate_string(const std::string &username, const uint32_t size)
	{
	  static const std::string base64_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/";
	  static std::default_random_engine generator(std::time(nullptr));
	  static std::uniform_int_distribution<int> distribution(0, static_cast<int>(base64_chars.size()) - 1);

	  count += 1;

	  std::string result = username + "/never+gonna+give+you+up/";
	  result.reserve(result.size() + size);
	  for (uint32_t i = 0; i < size; ++i)
	  {
	    result += base64_chars[distribution(generator)];
	  }
	  return result;
	}

	// Print a diagnostic and terminate the process if a CUDA runtime call failed.
	static void check_cuda(const cudaError_t status, const char *what)
	{
	  if (status != cudaSuccess)
	  {
	    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
	    std::exit(EXIT_FAILURE);
	  }
	}

	// Endless search for the candidate string whose digest has the most leading
	// zero bits.
	//
	// Each round generates num_of_itters batches of total_count_per_batch random
	// candidates, hashes every batch with sha256_transform, tracks the best
	// candidate seen so far and then prints throughput and the current best. The
	// loop never terminates on its own; the process ends on an external signal or
	// on the first CUDA error.
	int main()
	{
	  const std::string username = "k0in";
	  const uint32_t nonce_size = 8;
	  const uint32_t num_of_itters = 100;

	  const uint32_t total_count_per_batch = 1024;
	  const uint32_t chunk_size = 64;

	  const uint32_t size_per_batch = total_count_per_batch * chunk_size;

	  // Host buffers. host_messages starts zero-filled; only the candidate bytes
	  // are rewritten per batch, so the padding written below stays in place.
	  std::vector<uint8_t> host_messages(size_per_batch, 0);
	  std::vector<uint8_t> host_leading_zeros(total_count_per_batch, 0);

	  // Every candidate has the same length (username + separator + nonce_size
	  // characters), so one sample is enough to lay out the padding.
	  const size_t input_size = generate_string(username, nonce_size).size();
	  if (input_size + 1 + 8 > chunk_size)
	  {
	    // The kernel hashes exactly one block: the message, the 0x80 marker and
	    // the 8-byte length field must all fit in chunk_size bytes.
	    std::cerr << "Input of " << input_size << " bytes does not fit in a single "
	              << chunk_size << "-byte block" << std::endl;
	    return EXIT_FAILURE;
	  }

	  const uint64_t bit_length = static_cast<uint64_t>(input_size) * 8;
	  for (uint32_t j = 0; j < total_count_per_batch; ++j)
	  {
	    uint8_t *block = host_messages.data() + static_cast<size_t>(j) * chunk_size;

	    // Padding marker directly after the message bytes.
	    block[input_size] = 0x80;

	    // Message length in bits as a big-endian 64-bit integer in the last 8 bytes.
	    for (uint32_t byte = 0; byte < 8; ++byte)
	    {
	      block[chunk_size - 1 - byte] = static_cast<uint8_t>(bit_length >> (8 * byte));
	    }
	  }

	  // Allocate device memory
	  uint8_t *device_messages = nullptr;
	  uint8_t *device_leading_zeros = nullptr;
	  check_cuda(cudaMalloc(&device_messages, size_per_batch * sizeof(device_messages[0])),
	             "cudaMalloc device_messages");
	  check_cuda(cudaMalloc(&device_leading_zeros, total_count_per_batch * sizeof(device_leading_zeros[0])),
	             "cudaMalloc device_leading_zeros");

	  std::vector<std::string> inputs(total_count_per_batch);

	  uint32_t max_leading_zeros = 0;
	  std::string best_input;

	  // Launch configuration: one thread per message, grid rounded up.
	  const uint32_t threadsPerBlock = 256; // Start with a value like 256 or 512
	  const uint32_t blocksPerGrid = (total_count_per_batch + threadsPerBlock - 1) / threadsPerBlock;
	  while (true)
	  {
	    const auto start_nonce = count;
	    const auto start_time = std::chrono::high_resolution_clock::now();
	    for (uint32_t iter = 0; iter < num_of_itters; ++iter)
	    {
	      for (uint32_t j = 0; j < total_count_per_batch; ++j)
	      {
	        inputs[j] = generate_string(username, nonce_size);
	        // Overwrite the message bytes of block j; padding bytes are untouched.
	        std::copy(inputs[j].begin(), inputs[j].end(),
	                  host_messages.data() + static_cast<size_t>(j) * chunk_size);
	      }

	      // Copy messages to device
	      check_cuda(cudaMemcpy(device_messages, host_messages.data(),
	                            host_messages.size() * sizeof(host_messages[0]), cudaMemcpyHostToDevice),
	                 "cudaMemcpy host to device");

	      sha256_transform<<<blocksPerGrid, threadsPerBlock>>>(device_leading_zeros, device_messages, total_count_per_batch);
	      // A kernel launch returns no status; configuration errors show up via
	      // cudaGetLastError and execution errors at the next synchronising call.
	      check_cuda(cudaGetLastError(), "sha256_transform launch");
	      check_cuda(cudaDeviceSynchronize(), "sha256_transform execution");

	      // Copy leading-zero counts back to host
	      check_cuda(cudaMemcpy(host_leading_zeros.data(), device_leading_zeros,
	                            host_leading_zeros.size() * sizeof(host_leading_zeros[0]), cudaMemcpyDeviceToHost),
	                 "cudaMemcpy device to host");

	      for (uint32_t m = 0; m < total_count_per_batch; ++m)
	      {
	        const uint32_t leading_zeros = host_leading_zeros[m];
	        if (leading_zeros > max_leading_zeros)
	        {
	          max_leading_zeros = leading_zeros;
	          best_input = inputs[m];
	        }
	      }
	    }

	    const auto stop_time = std::chrono::high_resolution_clock::now();
	    const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time);
	    // Hashes per millisecond == KHash/s. Divide as doubles so the fractional
	    // part is kept, and guard against a zero-millisecond measurement.
	    const auto elapsed_ms = duration.count() != 0 ? duration.count() : 1;
	    const double hashrate = static_cast<double>(count - start_nonce) / static_cast<double>(elapsed_ms);

	    std::cout << "Hashrate: " << hashrate << " KHash/s " << duration.count() << " ms/iter/" << std::endl;
	    std::cout << "Leading zeros: " << max_leading_zeros << std::endl;
	    std::cout << "Best input: " << best_input << std::endl;
	    std::cout << "Nonce: " << count << " +" << count - start_nonce << std::endl;
	  }

	  // Not reached while the loop above is unconditional; kept so the device
	  // buffers are released if an exit condition is ever added. The host buffers
	  // are std::vector and release themselves.
	  cudaFree(device_messages);
	  cudaFree(device_leading_zeros);

	  return 0;
	}