monkins1010/scanshashv2.1.c

## scanshashv2.1.c
// Not referenced in the visible portion of this file.
static const int PROTOCOL_VERSION = 170002;

#include <cuda_helper.h>

// Index, in 32-bit words, used by scanhash_verus() to address the nonce word
// in the header arrays (pdata / endiandata). The "27:34" note presumably gives
// the word range of the whole nonce field -- TODO confirm.
#define EQNONCE_OFFSET 30 /* 27:34 */
#define NONCE_OFT EQNONCE_OFFSET

// Per-thread-index flag: set by scanhash_verus() after its one-time setup,
// cleared again by free_verushash().
static bool init[MAX_GPUS] = { 0 };
// valid_sols and data_sols are not referenced in the visible portion of this file.
static int valid_sols[MAX_GPUS] = { 0 };
static uint8_t _ALIGN(64) data_sols[MAX_GPUS][10][1536] = { 0 }; // 140+3+1344 required
// Number of nonces handed to verus_hash() per call; one value per host thread.
static __thread uint32_t throughput = 0;
// Defined outside this file (presumably the CUDA side of the hasher -- TODO confirm).
extern void verus_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces);
extern void verus_setBlock(uint8_t *blockf, uint32_t *pTargetIn, uint8_t *lkey, int thr_id);
extern void verus_init(int thr_id);


// Fallback for platforms that do not provide htobe32; maps it to swab32.
// The macro is not used in the visible portion of this file.
#ifndef htobe32
#define htobe32(x) swab32(x)
#endif

extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{

	uint32_t _ALIGN(64) endiandata[35];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	int dev_id = device_map[thr_id];

	struct timeval tv_start, tv_end, diff;
	double secs, solps;

	uint32_t nonce_buf = 0;
	uint32_t intensity = 20;

	unsigned char block_41970[] = { 0xfd, 0x40, 0x05, 0x01 };
	uint8_t _ALIGN(64) full_data[140 + 3 + 1344] = { 0 };
	uint8_t* sol_data = &full_data[140];

//	memcpy(full_data, block_41970, 1487);
	memcpy(endiandata, pdata, 140);
	memcpy(sol_data, block_41970, 4);
	memcpy(full_data, endiandata, 140);

	throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - nonce_buf);

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		cuda_get_arch(thr_id);
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		CVerusHash::init();
		CVerusHashV2::init();
		verus_init(thr_id);
		init[thr_id] = true;
	}
	alignas(32) uint256 curHash, curTarget = *(uint256*)work->target;
    const uint64_t *compResult = (uint64_t *)&curHash;
	const uint64_t *compTarget = (uint64_t *)&curTarget;
    CVerusHashV2 vh;

	verusclhasher &vclh = vh.vclh;
	uint8_t *localkey;
	localkey = (uint8_t *)malloc(VERUS_KEY_SIZE);  // make mem for 128 copies of key to send to gpu
	vh.Reset();
	vh.Write((const unsigned char*)full_data, 1487);

	//vhw.Reset();
	//vhw << work->data;
	u128 *hashKey = (u128 *)verusclhasher_key.get();
	verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
	void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes;
	const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks

	unsigned char *curBuf = vh.CurBuffer();
	vh.FillExtra((u128 *)curBuf);

	// skip keygen if it is the current key - TODO fix this
	//if (pdesc->seed != *((uint256 *)curBuf))
	//{
		// generate a new key by chain hashing with Haraka256 from the last curbuf
		// assume 256 bit boundary
		int n256blks = pdesc->keySizeInBytes >> 5;
		unsigned char *pkey = ((unsigned char *)hashKey);
		unsigned char *psrc = curBuf;
		for (int i = 0; i < n256blks; i++)
		{
			haraka256(pkey, psrc);
			psrc = pkey;
			pkey += 32;

		}
		pdesc->seed = *((uint256 *)curBuf);
		memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes);

	memcpy(localkey, hasherrefresh, 8832);

    work->valid_nonces = 0;
//	vh.Finalize2b((unsigned char *)&curHash);

	gettimeofday(&tv_start, NULL);
	verus_setBlock((uint8_t*)curBuf, work->target, localkey, thr_id); //set data to gpu kernel

	do {

		*hashes_done = nonce_buf + throughput;
		verus_hash(thr_id, throughput, nonce_buf, work->nonces);

		if (work->nonces[0] != UINT32_MAX)
		{
			const uint32_t Htarg = ptarget[7];
			uint32_t _ALIGN(64) vhash[8];

			*((uint32_t *)full_data + 368) = work->nonces[0];
			memcpy(curBuf + 32, full_data + 1486 - 14, 15);

			vh.Finalize2b((unsigned char *)&curHash);
			memcpy(vhash, &curHash, 32);

			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
			{

				work->valid_nonces++;

				memcpy(work->data, endiandata, 140);
				int nonce = work->valid_nonces - 1;
				memcpy(work->extra, sol_data, 1347);
				bn_store_hash_target_ratio(vhash, work->target, work, nonce);

				work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT];
				//pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
				goto out;
			}
			else if (vhash[7] > Htarg) {
				gpu_increment_reject(thr_id);
				if (!opt_quiet)
					gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]);
			}
		}
		if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) {

			break;
		}
		nonce_buf += throughput;

	} while (!work_restart[thr_id].restart);


out:
	gettimeofday(&tv_end, NULL);
	timeval_subtract(&diff, &tv_end, &tv_start);
	secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
	solps = (double)nonce_buf / secs;
	//gpulog(LOG_INFO, thr_id, "%d k/hashes in %.2f s (%.2f MH/s)", nonce_buf / 1000, secs, solps / 1000000);
	// H/s

	//*hashes_done = first_nonce;
	pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
	free(localkey);
	return work->valid_nonces;
}

// cleanup
/**
 * Clears the "initialised" flag of thread `thr_id`, so that the next
 * scanhash_verus() call for it runs the one-time setup again.
 * Nothing else is released here.
 */
void free_verushash(int thr_id)
{
	// Only an initialised thread has a flag to clear.
	if (init[thr_id]) {
		init[thr_id] = false;
	}
}
	// Not referenced in the visible portion of this file.
	static const int PROTOCOL_VERSION = 170002;

	#include <cuda_helper.h>

	// Index, in 32-bit words, used by scanhash_verus() to address the nonce
	// word in the header arrays (pdata / endiandata).
	#define EQNONCE_OFFSET 30 /* 27:34 */
	#define NONCE_OFT EQNONCE_OFFSET

	// Per-thread-index flag: set by scanhash_verus() after its one-time setup,
	// cleared again by free_verushash().
	static bool init[MAX_GPUS] = { 0 };
	// valid_sols and data_sols are not referenced in the visible portion of this file.
	static int valid_sols[MAX_GPUS] = { 0 };
	static uint8_t _ALIGN(64) data_sols[MAX_GPUS][10][1536] = { 0 }; // 140+3+1344 required
	// Number of nonces handed to verus_hash() per call; one value per host thread.
	static __thread uint32_t throughput = 0;
	// Defined outside this file. verus_setBlock() takes pointers: its caller
	// passes the hasher buffer, work->target and the host key buffer.
	extern void verus_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces);
	extern void verus_setBlock(uint8_t *blockf, uint32_t *pTargetIn, uint8_t *lkey, int thr_id);
	extern void verus_init(int thr_id);


	// Fallback for platforms that do not provide htobe32; maps it to swab32.
	#ifndef htobe32
	#define htobe32(x) swab32(x)
	#endif

	extern "C" int scanhash_verus(int thr_id, struct work work, uint32_t max_nonce, unsigned long hashes_done)
	{

	uint32_t _ALIGN(64) endiandata[35];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	int dev_id = device_map[thr_id];

	struct timeval tv_start, tv_end, diff;
	double secs, solps;

	uint32_t nonce_buf = 0;
	uint32_t intensity = 20;

	unsigned char block_41970[] = { 0xfd, 0x40, 0x05, 0x01 };
	uint8_t _ALIGN(64) full_data[140 + 3 + 1344] = { 0 };
	uint8_t* sol_data = &full_data[140];

	// memcpy(full_data, block_41970, 1487);
	memcpy(endiandata, pdata, 140);
	memcpy(sol_data, block_41970, 4);
	memcpy(full_data, endiandata, 140);

	throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - nonce_buf);

	if (!init[thr_id])
	{
	cudaSetDevice(dev_id);
	if (opt_cudaschedule == -1 && gpu_threads == 1) {
	cudaDeviceReset();
	// reduce cpu usage
	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
	CUDA_LOG_ERROR();
	}
	cuda_get_arch(thr_id);
	gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

	CVerusHash::init();
	CVerusHashV2::init();
	verus_init(thr_id);
	init[thr_id] = true;
	}
	alignas(32) uint256 curHash, curTarget = (uint256)work->target;
	const uint64_t compResult = (uint64_t )&curHash;
	const uint64_t compTarget = (uint64_t )&curTarget;
	CVerusHashV2 vh;

	verusclhasher &vclh = vh.vclh;
	uint8_t *localkey;
	localkey = (uint8_t *)malloc(VERUS_KEY_SIZE); // make mem for 128 copies of key to send to gpu
	vh.Reset();
	vh.Write((const unsigned char*)full_data, 1487);

	//vhw.Reset();
	//vhw << work->data;
	u128 hashKey = (u128 )verusclhasher_key.get();
	verusclhash_descr pdesc = (verusclhash_descr )verusclhasher_descr.get();
	void hasherrefresh = ((unsigned char )hashKey) + pdesc->keySizeInBytes;
	const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks

	unsigned char *curBuf = vh.CurBuffer();
	vh.FillExtra((u128 *)curBuf);

	// skip keygen if it is the current key - TODO fix this
	//if (pdesc->seed != ((uint256 )curBuf))
	//{
	// generate a new key by chain hashing with Haraka256 from the last curbuf
	// assume 256 bit boundary
	int n256blks = pdesc->keySizeInBytes >> 5;
	unsigned char pkey = ((unsigned char )hashKey);
	unsigned char *psrc = curBuf;
	for (int i = 0; i < n256blks; i++)
	{
	haraka256(pkey, psrc);
	psrc = pkey;
	pkey += 32;

	}
	pdesc->seed = ((uint256 )curBuf);
	memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes);

	memcpy(localkey, hasherrefresh, 8832);

	work->valid_nonces = 0;
	// vh.Finalize2b((unsigned char *)&curHash);

	gettimeofday(&tv_start, NULL);
	verus_setBlock((uint8_t*)curBuf, work->target, localkey, thr_id); //set data to gpu kernel

	do {

	*hashes_done = nonce_buf + throughput;
	verus_hash(thr_id, throughput, nonce_buf, work->nonces);

	if (work->nonces[0] != UINT32_MAX)
	{
	const uint32_t Htarg = ptarget[7];
	uint32_t _ALIGN(64) vhash[8];

	((uint32_t )full_data + 368) = work->nonces[0];
	memcpy(curBuf + 32, full_data + 1486 - 14, 15);

	vh.Finalize2b((unsigned char *)&curHash);
	memcpy(vhash, &curHash, 32);

	if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
	{

	work->valid_nonces++;

	memcpy(work->data, endiandata, 140);
	int nonce = work->valid_nonces - 1;
	memcpy(work->extra, sol_data, 1347);
	bn_store_hash_target_ratio(vhash, work->target, work, nonce);

	work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT];
	//pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
	goto out;
	}
	else if (vhash[7] > Htarg) {
	gpu_increment_reject(thr_id);
	if (!opt_quiet)
	gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]);
	}
	}
	if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) {

	break;
	}
	nonce_buf += throughput;

	} while (!work_restart[thr_id].restart);


	out:
	gettimeofday(&tv_end, NULL);
	timeval_subtract(&diff, &tv_end, &tv_start);
	secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
	solps = (double)nonce_buf / secs;
	//gpulog(LOG_INFO, thr_id, "%d k/hashes in %.2f s (%.2f MH/s)", nonce_buf / 1000, secs, solps / 1000000);
	// H/s

	//*hashes_done = first_nonce;
	pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
	free(localkey);
	return work->valid_nonces;
	}

	// cleanup
	/**
	 * Clears the "initialised" flag of thread `thr_id`, so that the next
	 * scanhash_verus() call for it runs the one-time setup again.
	 * Nothing else is released here.
	 */
	void free_verushash(int thr_id)
	{
		// Only an initialised thread has a flag to clear.
		if (init[thr_id]) {
			init[thr_id] = false;
		}
	}