cryptonight: extra cleanup, to be continued...

cbuchner1 · Jun 22, 2018 · 1cc5b6d · 1cc5b6d
1 parent 793c9f1
commit 1cc5b6d
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 204 deletions.
diff --git a/crypto/cn_groestl.cuh b/crypto/cn_groestl.cuh
@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState*  __restrict__ ctx, BitSequence* __restri
 	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
 		output[j] = s[i];
 	}
-
+#if 0
 	for (i = 0; i < GROESTL_COLS512; i++) {
 		ctx->chaining[i] = 0;
 	}
 	for (i = 0; i < GROESTL_SIZE512; i++) {
 		ctx->buffer[i] = 0;
 	}
+#endif
 }
 
 __device__

diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu
@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 
 	if(thread < threads)
 	{
-		const int oft = thread * 52 + sub + 16; // not aligned 16!
+		const int oft = thread * 50 + sub + 16; // not aligned 16!
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -222,7 +222,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 	if(thread < threads)
 	{
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
-		const int oft = thread * 52 + sub + 16;
+		const int oft = thread * 50 + sub + 16;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
 

diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu
@@ -67,7 +67,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 
 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
+		cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);

diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h
@@ -137,8 +137,8 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 
 void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
 
-void cryptonight_extra_setData(int thr_id, const void *data, const void *pTargetIn);
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
 void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
 void cryptonight_extra_free(int thr_id);
-void cryptonight_extra_prepare(int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak1_2);
-void cryptonight_extra_final(int thr_id, int threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu
@@ -126,121 +126,6 @@ void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, cons
 
 // --------------------------------------------------------------------------------------------------------------
 
-#if 0
-#if UINTPTR_MAX == UINT64_MAX
-#define LPTR "l"
-#else
-#define LPTR "r"
-#endif
-
-__device__ __forceinline__ uint64_t loadGlobal64(uint64_t * const addr) {
-	uint64_t x;
-	asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(x) : LPTR (addr));
-	return x;
-}
-
-__device__ __forceinline__ uint32_t loadGlobal32(uint32_t * const addr) {
-	uint32_t x;
-	asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : LPTR (addr));
-	return x;
-}
-
-__device__ __forceinline__ void storeGlobal32(uint32_t* addr, uint32_t const & val) {
-	asm volatile("st.global.cg.u32 [%0], %1;" : : LPTR (addr), "r"(val));
-}
-
-__device__ __forceinline__ uint32_t variant1_1(const uint32_t src)
-{
-	const uint32_t tmp = (src >> 24); // __byte_perm(src, 0, 0x7773);
-	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
-	return (src & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
-}
-
-__global__
-void monero_phase2_messed(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
-	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
-	uint64_t * __restrict__ d_tweak)
-{
-	__shared__ uint32_t sharedMemory[1024];
-	cn_aes_gpu_init(sharedMemory);
-	__syncthreads();
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
-	if (thread >= threads)
-		return;
-
-	const uint32_t batchsize = ITER >> (2U + bfactor);
-	const uint32_t start = partidx * batchsize;
-	const uint32_t end = start + batchsize;
-
-	const uint32_t subthr = threadIdx.x & 3;
-	const uint32_t thrctx = thread << 2;
-	uint32_t * ctx_a = &d_ctx_a[thrctx];
-	uint32_t * ctx_b = &d_ctx_b[thrctx];
-	uint32_t * long_state = &d_long_state[thread << LONG_SHL32];
-
-	uint32_t d[2], t1[2], t2[2];
-	uint32_t a = ctx_a[subthr];
-	d[1] = ctx_b[subthr];
-
-	uint32_t tweak[2];
-	AS_UINT2(&tweak) = AS_UINT2(&d_tweak[thread]);
-
-	for (uint32_t i = start; i < end; i++)
-	{
-		#pragma unroll 2
-		for (int x = 0; x < 2; x++)
-		{
-			uint32_t j = ((__shfl(a, 0, 4) & 0x1FFFF0) >> 2) + subthr;
-
-			const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
-			const uint32_t x_1 = __shfl(x_0, subthr + 1, 4);
-			const uint32_t x_2 = __shfl(x_0, subthr + 2, 4);
-			const uint32_t x_3 = __shfl(x_0, subthr + 3, 4);
-
-			// t_fn = aes shared mem read
-			d[x] = a ^ 	t_fn0(x_0 & 0xff) ^
-				t_fn1((x_1 >>  8) & 0xff) ^
-				t_fn2((x_2 >> 16) & 0xff) ^
-				t_fn3((x_3 >> 24));
-			t1[0] = __shfl(d[x], 0, 4);
-
-			uint32_t z = d[0] ^ d[1];
-			if (subthr == 2) z = variant1_1(z);
-			storeGlobal32(long_state + j, z);
-
-			// -----------------------------------------------------------
-			j = ((*t1 & 0x1FFFF0) >> 2) + subthr;
-
-			uint32_t yy[2];
-			AS_U64(yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1));
-
-			t1[1] = __shfl(d[x], 1, 4);
-
-			uint32_t sub2 = (threadIdx.x & 2);
-			t2[0] = __shfl(a, sub2, 4);
-			t2[1] = __shfl(a, sub2 + 1U, 4);
-
-			uint32_t zz[2];
-			zz[0] = __shfl(yy[0], 0, 4);
-			zz[1] = __shfl(yy[1], 0, 4);
-
-			AS_U64(t2) += sub2 ? (AS_U64(t1) * AS_U64(zz)) : __umul64hi(AS_U64(t1), AS_U64(zz));
-
-			uint32_t s1 = subthr & 1U;
-			z = AS_U64(t2) >> (s1 * 32U); // hi or lo dword
-			//z = __byte_perm(t2[0], t2[1], s1 ? 0x7654 : 0x3210);
-			storeGlobal32(long_state + j, sub2 ? tweak[s1] ^ z : z);
-			a = z ^ yy[s1];
-		}
-	}
-
-	if (bfactor) {
-		ctx_a[subthr] = a;
-		ctx_b[subthr] = d[1];
-	}
-}
-#endif
-
 __device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
 {
 	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
@@ -345,7 +230,7 @@ void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict_
 extern int device_bfactor[MAX_GPUS];
 
 __host__
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
 	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
@@ -371,7 +256,6 @@ void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long
 			cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
 		else
 			monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
-			//monero_phase2_messed <<<grid, b>>> (throughput, bfactor, i, (uint32_t*) d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp
@@ -15,9 +15,8 @@ extern "C" {
 #define VARIANT1_1(p) \
 	if (variant > 0) { \
 		const uint8_t tmp = ((const uint8_t*)(p))[11]; \
-		static const uint32_t table = 0x75310; \
 		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
-		((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
+		((uint8_t*)(p))[11] = tmp ^ ((0x75310 >> index) & 0x30); \
 	}
 
 struct cryptonight_ctx {