timetravel algo

+ new kernels jh512-80 groestl-80 and cubehash-80 Signed-off-by: Tanguy Pruvot <[email protected]>
DumaxFr · Mar 7, 2017 · 07ebcb5 · 07ebcb5
1 parent 3ede61b
commit 07ebcb5
Show file tree

Hide file tree

Showing 15 changed files with 959 additions and 44 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -64,7 +64,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
-			  x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu \
+			  x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 			  x15/whirlpool.cu \

diff --git a/algos.h b/algos.h
@@ -43,6 +43,7 @@ enum sha_algos {
 	ALGO_SKEIN,
 	ALGO_SKEIN2,
 	ALGO_S3,
+	ALGO_TIMETRAVEL,
 	ALGO_X11EVO,
 	ALGO_X11,
 	ALGO_X13,
@@ -101,6 +102,7 @@ static const char *algo_names[] = {
 	"skein",
 	"skein2",
 	"s3",
+	"timetravel",
 	"x11evo",
 	"x11",
 	"x13",

diff --git a/bench.cpp b/bench.cpp
@@ -92,6 +92,7 @@ void algo_free_all(int thr_id)
 	//free_sha256d(thr_id);
 	free_scrypt(thr_id);
 	free_scrypt_jane(thr_id);
+	free_timetravel(thr_id);
 }
 
 // benchmark all algos (called once per mining thread)

diff --git a/ccminer.cpp b/ccminer.cpp
@@ -261,6 +261,7 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			s3          S3 (1Coin)\n\
+			timetravel  Machinecoin permuted x8\n\
 			vanilla     Blake256-8 (VNL)\n\
 			veltor      Thorsriddle streebog\n\
 			whirlcoin   Old Whirlcoin (Whirlpool algo)\n\
@@ -1619,6 +1620,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_LBRY:
 		case ALGO_LYRA2v2:
 		case ALGO_LYRA2Z:
+		case ALGO_TIMETRAVEL:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
@@ -2121,6 +2123,7 @@ static void *miner_thread(void *userdata)
 			case ALGO_HEAVY:
 			case ALGO_LYRA2v2:
 			case ALGO_S3:
+			case ALGO_TIMETRAVEL:
 			case ALGO_X11EVO:
 			case ALGO_X11:
 			case ALGO_X13:
@@ -2333,6 +2336,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_WILDKECCAK:
 			rc = scanhash_wildkeccak(thr_id, &work, max_nonce, &hashes_done);
 			break;
+		case ALGO_TIMETRAVEL:
+			rc = scanhash_timetravel(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_X11EVO:
 			rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done);
 			break;

diff --git a/ccminer.vcxproj b/ccminer.vcxproj
@@ -539,6 +539,7 @@
     <CudaCompile Include="x11\fresh.cu" />
     <CudaCompile Include="x11\sib.cu" />
     <CudaCompile Include="x11\s3.cu" />
+    <CudaCompile Include="x11\timetravel.cu" />
     <CudaCompile Include="x11\veltor.cu" />
     <CudaCompile Include="x11\x11.cu" />
     <CudaCompile Include="x11\x11evo.cu" />

diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
@@ -739,6 +739,9 @@
     <CudaCompile Include="x11\s3.cu">
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>
+    <CudaCompile Include="x11\timetravel.cu">
+      <Filter>Source Files\CUDA\x11</Filter>
+    </CudaCompile>
     <CudaCompile Include="x11\veltor.cu">
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>

diff --git a/miner.h b/miner.h
@@ -306,6 +306,7 @@ extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsig
 extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds);
 extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -360,6 +361,7 @@ extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
 extern void free_skein2(int thr_id);
 extern void free_s3(int thr_id);
+extern void free_timetravel(int thr_id);
 extern void free_vanilla(int thr_id);
 extern void free_veltor(int thr_id);
 extern void free_whirl(int thr_id);
@@ -882,6 +884,7 @@ void sibhash(void *output, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
+void timetravel_hash(void *output, const void *input);
 void veltorhash(void *output, const void *input);
 void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);

diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu
@@ -334,3 +334,86 @@ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
 
 // Setup function
 __host__ void  quark_jh512_cpu_init(int thr_id, uint32_t threads) {}
+
+#define WANT_JH80
+#ifdef WANT_JH80
+
+__constant__
+static uint32_t c_PaddedMessage80[20]; // padded message (80 bytes)
+
+__host__
+void jh512_setBlock_80(int thr_id, uint32_t *endiandata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t h[20];
+		AS_UINT4(&h[ 0]) = AS_UINT4(&c_PaddedMessage80[ 0]);
+		AS_UINT4(&h[ 4]) = AS_UINT4(&c_PaddedMessage80[ 4]);
+		AS_UINT4(&h[ 8]) = AS_UINT4(&c_PaddedMessage80[ 8]);
+		AS_UINT4(&h[12]) = AS_UINT4(&c_PaddedMessage80[12]);
+		AS_UINT2(&h[16]) = AS_UINT2(&c_PaddedMessage80[16]);
+		h[18] = c_PaddedMessage80[18];
+		h[19] = cuda_swab32(startNounce + thread);
+
+		uint32_t x[8][4] = { /* init */
+			{ 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a },
+			{ 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 },
+			{ 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea },
+			{ 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba },
+			{ 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e },
+			{ 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d },
+			{ 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 },
+			{ 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc }
+		};
+
+		// 1 (could be precomputed)
+		#pragma unroll
+		for (int i = 0; i < 16; i++)
+			x[i/4][i & 3] ^= h[i];
+		E8(x);
+		#pragma unroll
+		for (int i = 0; i < 16; i++)
+			x[(i+16)/4][(i+16) & 3] ^= h[i];
+
+		// 2 (16 bytes with nonce)
+		#pragma unroll
+		for (int i = 0; i < 4; i++)
+			x[0][i] ^= h[16+i];
+		x[1][0] ^= 0x80U;
+		E8(x);
+		#pragma unroll
+		for (int i = 0; i < 4; i++)
+			x[4][i] ^= h[16+i];
+		x[5][0] ^= 0x80U;
+
+		// 3 close
+		x[3][3] ^= 0x80020000U; // 80 bytes = 640bits (0x280)
+		E8(x);
+		x[7][3] ^= 0x80020000U;
+
+		uint32_t *Hash = &g_outhash[(size_t)16 * thread];
+		AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]);
+		AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]);
+		AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]);
+		AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]);
+	}
+}
+
+__host__
+void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 256;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	jh512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_hash);
+}
+
+#endif
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
@@ -18,6 +18,11 @@
 #include "groestl_transf_quad.h"
 #endif
 
+#define WANT_GROESTL80
+#ifdef WANT_GROESTL80
+__constant__ static uint32_t c_Message80[20];
+#endif
+
 #include "cuda_quark_groestl512_sm2.cuh"
 
 __global__ __launch_bounds__(TPB, THF)
@@ -114,3 +119,93 @@ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNo
 		quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
 }
 
+// --------------------------------------------------------------------------------------------------------------------------------------------
+
+#ifdef WANT_GROESTL80
+
+__host__
+void groestl512_setBlock_80(int thr_id, uint32_t *endiandata)
+{
+	cudaMemcpyToSymbol(c_Message80, endiandata, sizeof(c_Message80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__ __launch_bounds__(TPB, THF)
+void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
+{
+#if __CUDA_ARCH__ >= 300
+	// BEWARE : 4-WAY CODE (one hash need 4 threads)
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
+	if (thread < threads)
+	{
+		const uint32_t thr = threadIdx.x & 0x3; // % THF
+
+		/*| M0 M1 M2 M3 M4 | M5 M6 M7 | (input)
+		--|----------------|----------|
+		T0|  0  4  8 12 16 | 80       |
+		T1|  1  5       17 |          |
+		T2|  2  6       18 |          |
+		T3|  3  7       Nc |       01 |
+		--|----------------|----------| TPR */
+
+		uint32_t message[8];
+
+		#pragma unroll 5
+		for(int k=0; k<5; k++) message[k] = c_Message80[thr + (k * THF)];
+
+		#pragma unroll 3
+		for(int k=5; k<8; k++) message[k] = 0;
+
+		if (thr == 0) message[5] = 0x80U;
+		if (thr == 3) {
+			message[4] = cuda_swab32(startNounce + thread);
+			message[7] = 0x01000000U;
+		}
+
+		uint32_t msgBitsliced[8];
+		to_bitslice_quad(message, msgBitsliced);
+
+		uint32_t state[8];
+		groestl512_progressMessage_quad(state, msgBitsliced);
+
+		uint32_t hash[16];
+		from_bitslice_quad(state, hash);
+
+		if (thr == 0) { /* 4 threads were done */
+			const off_t hashPosition = thread;
+			//if (!thread) hash[15] = 0xFFFFFFFF;
+			uint4 *outpt = (uint4*) &g_outhash[hashPosition << 4];
+			uint4 *phash = (uint4*) hash;
+			outpt[0] = phash[0];
+			outpt[1] = phash[1];
+			outpt[2] = phash[2];
+			outpt[3] = phash[3];
+		}
+	}
+#endif
+}
+
+__host__
+void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	int dev_id = device_map[thr_id];
+
+	if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) {
+		const uint32_t threadsperblock = TPB;
+		const uint32_t factor = THF;
+
+		dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
+		dim3 block(threadsperblock);
+
+		groestl512_gpu_hash_80_quad <<<grid, block>>> (threads, startNounce, d_hash);
+
+	} else {
+
+		const uint32_t threadsperblock = 256;
+		dim3 grid((threads + threadsperblock-1)/threadsperblock);
+		dim3 block(threadsperblock);
+
+		groestl512_gpu_hash_80_sm2 <<<grid, block>>> (threads, startNounce, d_hash);
+	}
+}
+
+#endif