From 2ca6ede92b2c43e485e01ca839d6131b15586ec9 Mon Sep 17 00:00:00 2001 From: Christian Buchner Date: Thu, 27 Mar 2014 00:47:12 +0100 Subject: [PATCH] update to version v0.5 (2014-03-27) --- Makefile.am | 2 +- Makefile.in | 2 +- README.txt | 42 +++- ccminer.vcxproj | 27 +-- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 50 +++- cpuminer-config.h | 4 +- cuda_blake512.cu | 4 +- cuda_combine.cu | 2 +- cuda_fugue256.cu | 9 +- cuda_groestl512.cu | 2 +- cuda_groestlcoin.cu | 276 ++++++++++++---------- cuda_hefty1.cu | 565 +++++++++++++++++++++++--------------------- cuda_keccak512.cu | 2 +- cuda_sha256.cu | 9 +- heavy.cu | 35 +++ 17 files changed, 588 insertions(+), 465 deletions(-) diff --git a/Makefile.am b/Makefile.am index c523ffa6a4..3f8f816790 100644 --- a/Makefile.am +++ b/Makefile.am @@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< diff --git a/Makefile.in b/Makefile.in index 0925b1f4d0..130d7ce2d4 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/README.txt b/README.txt index 31dbac55f6..3c956e081f 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release +ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization" ------------------------------------------------------------- *************************************************************** @@ -38,6 +38,11 @@ its command line interface and options. fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin + -d, --devices gives a comma separated list of CUDA device IDs + to operate on. Device IDs start counting from 0! + Alternatively give string names of your card like + gtx780ti or gt640#2 (matching 2nd gt640 in the PC). + -o, --url=URL URL of mining server (default: " DEF_RPC_URL ") -O, --userpass=U:P username:password pair for mining server -u, --user=USERNAME username for mining server @@ -63,8 +68,10 @@ its command line interface and options. -V, --version display version information and exit -h, --help display this help text and exit + >>> Examples <<< + Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <> -p <> -v 512 @@ -107,22 +114,33 @@ from your old clunkers. >>> RELEASE HISTORY <<< - March, 24 2014 fixed Groestl pool support + March, 27 2014 Heavycoin exchange rates soar, and as a result this coin + gets some love: We greatly optimized the Hefty1 kernel + for speed. Expect some hefty gains, especially on 750Ti's! + + By popular demand, we added the -d option as known from + cudaminer. + + different compute capability builds are now provided until + we figure out how to pack everything into a single executable + in a Windows build. + + March, 24 2014 fixed Groestl pool support - went back to Compute 1.x for cuda_hefty1.cu kernel by - default after numerous reports of ccminer v0.2/v0.3 - not working with HeavyCoin for some people. + went back to Compute 1.x for cuda_hefty1.cu kernel by + default after numerous reports of ccminer v0.2/v0.3 + not working with HeavyCoin for some people. - March, 23 2014 added Groestlcoin support. stratum status unknown - (the only pool is currently down for fixing issues) + March, 23 2014 added Groestlcoin support. stratum status unknown + (the only pool is currently down for fixing issues) - March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates - on Fermi and Maxwell devices. Kepler may suffer slightly - (3-5%) + March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates + on Fermi and Maxwell devices. Kepler may suffer slightly + (3-5%) - Fixed Stratum for Fuguecoin. Tested on dwarfpool. + Fixed Stratum for Fuguecoin. Tested on dwarfpool. - March, 18 2014 initial release. + March, 18 2014 initial release. >>> AUTHORS <<< diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 029fceb062..5424f0a422 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" true - 63 + 80 true true - compute_20,sm_20 + compute_35,sm_35 -Xptxas "-abi=no -v" %(AdditionalOptions) @@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" true - 63 + 80 true true - compute_20,sm_20 + compute_35,sm_35 -Xptxas "-abi=no -v" %(AdditionalOptions) @@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" true - 63 + 80 true true - compute_20,sm_20 + compute_35,sm_35 -Xptxas "-abi=no -v" %(AdditionalOptions) @@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" true - 63 + 80 true true - compute_20,sm_20 + compute_35,sm_35 -Xptxas "-abi=no -v" %(AdditionalOptions) @@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - compute_10,sm_10 - compute_10,sm_10 - 124 - 124 - compute_10,sm_10 - 124 - compute_10,sm_10 - 124 - + diff --git a/configure b/configure index c565c170e6..d826992518 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24. +# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27. # # # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -557,8 +557,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='ccminer' PACKAGE_TARNAME='ccminer' -PACKAGE_VERSION='2014.03.24' -PACKAGE_STRING='ccminer 2014.03.24' +PACKAGE_VERSION='2014.03.27' +PACKAGE_STRING='ccminer 2014.03.27' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems. +\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1368,7 +1368,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of ccminer 2014.03.24:";; + short | recursive ) echo "Configuration of ccminer 2014.03.27:";; esac cat <<\_ACEOF @@ -1469,7 +1469,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -ccminer configure 2014.03.24 +ccminer configure 2014.03.27 generated by GNU Autoconf 2.68 Copyright (C) 2010 Free Software Foundation, Inc. @@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by ccminer $as_me 2014.03.24, which was +It was created by ccminer $as_me 2014.03.27, which was generated by GNU Autoconf 2.68. Invocation command line was $ $0 $@ @@ -2901,7 +2901,7 @@ fi # Define the identity of the package. PACKAGE='ccminer' - VERSION='2014.03.24' + VERSION='2014.03.27' cat >>confdefs.h <<_ACEOF @@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by ccminer $as_me 2014.03.24, which was +This file was extended by ccminer $as_me 2014.03.27, which was generated by GNU Autoconf 2.68. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7184,7 +7184,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -ccminer config.status 2014.03.24 +ccminer config.status 2014.03.27 configured by $0, generated by GNU Autoconf 2.68, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 18063f98f7..ce82fdc86d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.03.24]) +AC_INIT([ccminer], [2014.03.27]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e88b79ed77..370a257f02 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -51,8 +51,13 @@ // from heavy.cu #ifdef __cplusplus extern "C" +{ #endif int cuda_num_devices(); +int cuda_finddevice(char *name); +#ifdef __cplusplus +} +#endif #ifdef __linux /* Linux specific policy and affinity management */ @@ -144,10 +149,11 @@ static int opt_scantime = 5; static json_t *opt_config; static const bool opt_time = true; static sha256_algos opt_algo = ALGO_HEAVY; -static int opt_n_threads; +static int opt_n_threads = 0; bool opt_trust_pool = false; uint16_t opt_vote = 9999; static int num_processors; +int device_map[8] = {0,1,2,3,4,5,6,7}; // CB static char *rpc_url; static char *rpc_userpass; static char *rpc_user, *rpc_pass; @@ -185,7 +191,11 @@ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ fugue256 Fuguecoin hash\n\ heavy Heavycoin hash\n\ - -v, --vote=VOTE block reward vote\n\ + -d, --devices takes a comma separated list of CUDA devices to use.\n\ + Device IDs start counting from 0! Alternatively takes\n\ + string names of your cards like gtx780ti or gt640#2\n\ + (matching 2nd gt640 in the PC)\n\ + -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -227,7 +237,7 @@ static char const short_options[] = #ifdef HAVE_SYSLOG_H "S" #endif - "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:"; + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:"; static struct option const options[] = { { "algo", 1, NULL, 'a' }, @@ -259,6 +269,7 @@ static struct option const options[] = { { "user", 1, NULL, 'u' }, { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, + { "devices", 1, NULL, 'd' }, { 0, 0, 0, 0 } }; @@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg) case 'S': use_syslog = true; break; + case 'd': // CB + { + char * pch = strtok (arg,","); + opt_n_threads = 0; + while (pch != NULL) { + if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + { + if (atoi(pch) < num_processors) + device_map[opt_n_threads++] = atoi(pch); + else { + applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); + exit(1); + } + } else { + int device = cuda_finddevice(pch); + if (device >= 0 && device < num_processors) + device_map[opt_n_threads++] = device; + else { + applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); + exit(1); + } + } + pch = strtok (NULL, ","); + } + } + break; case 'V': show_version_and_exit(); case 'h': @@ -1346,7 +1383,7 @@ static void signal_handler(int sig) } #endif -#define PROGRAM_VERSION "0.4" +#define PROGRAM_VERSION "0.5" int main(int argc, char *argv[]) { struct thr_info *thr; @@ -1370,6 +1407,9 @@ int main(int argc, char *argv[]) rpc_user = strdup(""); rpc_pass = strdup(""); + pthread_mutex_init(&applog_lock, NULL); + num_processors = cuda_num_devices(); + /* parse command line */ parse_cmdline(argc, argv); @@ -1385,7 +1425,6 @@ int main(int argc, char *argv[]) sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); } - pthread_mutex_init(&applog_lock, NULL); pthread_mutex_init(&stats_lock, NULL); pthread_mutex_init(&g_work_lock, NULL); pthread_mutex_init(&stratum.sock_lock, NULL); @@ -1416,7 +1455,6 @@ int main(int argc, char *argv[]) } #endif - num_processors = cuda_num_devices(); if (num_processors == 0) { applog(LOG_ERR, "No CUDA devices found! terminating."); diff --git a/cpuminer-config.h b/cpuminer-config.h index 50914abdb7..8b63e2e3cf 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -152,7 +152,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.03.24" +#define PACKAGE_STRING "ccminer 2014.03.27" /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME @@ -161,7 +161,7 @@ #undef PACKAGE_URL /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.03.24" +#define PACKAGE_VERSION "2014.03.27" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/cuda_blake512.cu b/cuda_blake512.cu index 79fef85f94..3602b232cd 100644 --- a/cuda_blake512.cu +++ b/cuda_blake512.cu @@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata) __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) { - const int threadsperblock = 128; + const int threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); diff --git a/cuda_combine.cu b/cuda_combine.cu index eabb2650ca..fb1033c155 100644 --- a/cuda_combine.cu +++ b/cuda_combine.cu @@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu index bc8b9eeabe..7f0909948f 100644 --- a/cuda_fugue256.cu +++ b/cuda_fugue256.cu @@ -9,7 +9,10 @@ #define USE_SHARED 1 -// heavy.cu +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); // Folgende Definitionen später durch header ersetzen @@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas void fugue256_cpu_init(int thr_id, int threads) { - cudaSetDevice(thr_id); + cudaSetDevice(device_map[thr_id]); // Kopiere die Hash-Tabellen in den GPU-Speicher texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); @@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void * dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs #if USE_SHARED size_t shared_size = 4 * 256 * sizeof(uint32_t); #else diff --git a/cuda_groestl512.cu b/cuda_groestl512.cu index e29f4f6dbc..1aebcf341c 100644 --- a/cuda_groestl512.cu +++ b/cuda_groestl512.cu @@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 442b1dc83d..56d1a8933a 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -9,6 +9,10 @@ #define USE_SHARED 1 +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); // Folgende Definitionen später durch header ersetzen @@ -20,13 +24,7 @@ typedef unsigned long long uint64_t; __constant__ uint32_t pTarget[8]; // Single GPU extern uint32_t *d_resultNonce[8]; -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hashGROESTLCOINoutput[8]; - -__constant__ uint32_t groestlcoin_gpu_state[32]; __constant__ uint32_t groestlcoin_gpu_msg[32]; -__constant__ uint32_t sha256coin_gpu_constantTable[64]; -__constant__ uint32_t sha256coin_gpu_register[8]; #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) @@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[]; extern uint32_t T3up_cpu[]; extern uint32_t T3dn_cpu[]; -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else + // Kepler (Compute 3.5) + #define S(x, n) __funnelshift_r( x, x, n ); +#endif #define R(x, n) ((x) >> (n)) #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) @@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[]; #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) -__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) +__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) { uint32_t t[32]; //#pragma unroll 14 for(int r=0;r<14;r++) { -#pragma unroll 16 - for(int k=0;k<16;k++) + switch(r) { - a[(k*2)+0] ^= PC32up(k * 0x10, r); - //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); + case 0: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; + case 1: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; + case 2: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; + case 3: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; + case 4: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; + case 5: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; + case 6: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; + case 7: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; + case 8: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; + case 9: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; + case 10: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; + case 11: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; + case 12: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; + case 13: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; } // RBTT @@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) } } -__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) +__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) { //#pragma unroll 14 for(int r=0;r<14;r++) { uint32_t t[32]; -#pragma unroll 16 - for(int k=0;k<16;k++) + switch(r) { - a[(k*2)+0] ^= QC32up(k * 0x10, r); - a[(k*2)+1] ^= QC32dn(k * 0x10, r); + case 0: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; + case 1: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; + case 2: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; + case 3: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; + case 4: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; + case 5: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; + case 6: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; + case 7: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; + case 8: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; + case 9: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; + case 10: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; + case 11: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; + case 12: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; + case 13: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; } // RBTT @@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) } } #if USE_SHARED -__global__ void __launch_bounds__(256) +__global__ void /* __launch_bounds__(256) */ #else __global__ void #endif - groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) + groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) { #if USE_SHARED extern __shared__ char mixtabs[]; @@ -204,146 +286,111 @@ __global__ void int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - ///// - ///// Lieber groestl, mach, dass es abgeht!!! - ///// // GROESTL uint32_t message[32]; uint32_t state[32]; - uint32_t g[32]; - #pragma unroll 32 - for(int k=0;k<32;k++) - { - // TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem - // constant Memory zu lesen. Das ist Verschwendung von Bandbreite. - state[k] = groestlcoin_gpu_state[k]; - message[k] = groestlcoin_gpu_msg[k]; - } + for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k]; uint32_t nounce = startNounce + thread; message[19] = SWAB32(nounce); #pragma unroll 32 - for(int u=0;u<32;u++) - g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0. + for(int u=0;u<32;u++) state[u] = message[u]; + state[31] ^= 0x20000; // Perm #if USE_SHARED - groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[] - groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen? + groestlcoin_perm_P(state, mixtabs); + state[31] ^= 0x20000; + groestlcoin_perm_Q(message, mixtabs); #else - groestlcoin_perm_P(g, NULL); + groestlcoin_perm_P(state, NULL); + state[31] ^= 0x20000; groestlcoin_perm_Q(message, NULL); #endif - #pragma unroll 32 - for(int u=0;u<32;u++) - { - // TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion - // was den Registerbedarf senken koennte? - state[u] ^= g[u] ^ message[u]; - g[u] = state[u]; - } + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; #if USE_SHARED - groestlcoin_perm_P(g, mixtabs); + groestlcoin_perm_P(message, mixtabs); #else - groestlcoin_perm_P(g, NULL); + groestlcoin_perm_P(message, NULL); #endif #pragma unroll 32 - for(int u=0;u<32;u++) - state[u] ^= g[u]; + for(int u=0;u<32;u++) state[u] ^= message[u]; //// //// 2. Runde groestl //// #pragma unroll 16 - for(int k=0;k<16;k++) - message[k] = state[k + 16]; - -#pragma unroll 32 - for(int k=0;k<32;k++) - state[k] = groestlcoin_gpu_state[k]; - -#pragma unroll 16 - for(int k=0;k<16;k++) + for(int k=0;k<16;k++) message[k] = state[k + 16]; +#pragma unroll 14 + for(int k=1;k<15;k++) message[k+16] = 0; - message[16] = 0x80; + message[16] = 0x80; message[31] = 0x01000000; #pragma unroll 32 for(int u=0;u<32;u++) - g[u] = message[u] ^ state[u]; + state[u] = message[u]; + state[31] ^= 0x20000; // Perm #if USE_SHARED - groestlcoin_perm_P(g, mixtabs); + groestlcoin_perm_P(state, mixtabs); + state[31] ^= 0x20000; groestlcoin_perm_Q(message, mixtabs); #else - groestlcoin_perm_P(g, NULL); + groestlcoin_perm_P(state, NULL); + state[31] ^= 0x20000; groestlcoin_perm_Q(message, NULL); #endif #pragma unroll 32 - for(int u=0;u<32;u++) - { - state[u] ^= g[u] ^ message[u]; - g[u] = state[u]; - } + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; #if USE_SHARED - groestlcoin_perm_P(g, mixtabs); + groestlcoin_perm_P(message, mixtabs); #else - groestlcoin_perm_P(g, NULL); + groestlcoin_perm_P(message, NULL); #endif #pragma unroll 32 - for(int u=0;u<32;u++) - state[u] ^= g[u]; - -/* - #pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] = state[k+16]; -*/ + for(int u=0;u<32;u++) state[u] ^= message[u]; // kopiere Ergebnis - /* -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*thread+k] = state[k + 16]; - */ - int i; + int i, position = -1; bool rc = true; - + +#pragma unroll 8 for (i = 7; i >= 0; i--) { if (state[i+16] > pTarget[i]) { - rc = false; - break; - } - if (state[i+16] < pTarget[i]) { - rc = true; - break; - } + if(position < i) { + position = i; + rc = false; + } + } + if (state[i+16] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } } if(rc == true) - { if(resNounce[0] > nounce) - { resNounce[0] = nounce; - /* - #pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[k] = (hash[k]); - */ - } - } - } } @@ -360,7 +407,7 @@ __global__ void // Setup-Funktionen __host__ void groestlcoin_cpu_init(int thr_id, int threads) { - cudaSetDevice(thr_id); + cudaSetDevice(device_map[thr_id]); cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ); // Texturen mit obigem Makro initialisieren texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); @@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads) texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - // setze register - // TODO: fast vollstaendige Vorbelegung mit Nullen. - // da besteht doch Optimierungspotenzial im GPU Kernel - // denn mit Nullen braucht man nicht wirklich rechnen. - uint32_t groestl_state_init[32]; - memset(groestl_state_init, 0, sizeof(uint32_t) * 32); - groestl_state_init[31] = 0x20000; - - // state speichern - cudaMemcpyToSymbol( groestlcoin_gpu_state, - groestl_state_init, - 128); - + // Speicher für Gewinner-Nonce belegen cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); - - // Speicher für alle Ergebnisse belegen (nur für Debug) - cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads); } __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) @@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs #if USE_SHARED size_t shared_size = 8 * 256 * sizeof(uint32_t); #else @@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); //fprintf(stderr, "ThrID: %d\n", thr_id); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash<<>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]); + groestlcoin_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, 0, thr_id); cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - /// Debug - //cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost); - - // Nounce - //cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cuda_hefty1.cu b/cuda_hefty1.cu index f1333d3dcd..2f72c0fe46 100644 --- a/cuda_hefty1.cu +++ b/cuda_hefty1.cu @@ -2,26 +2,40 @@ #include "cuda_runtime.h" #include "device_launch_parameters.h" +// aus cpu-miner.c +extern int device_map[8]; + #include #include +#define USE_SHARED 1 + // Folgende Definitionen später durch header ersetzen typedef unsigned int uint32_t; typedef unsigned char uint8_t; typedef unsigned short uint16_t; +// diese Struktur wird in der Init Funktion angefordert +static cudaDeviceProp props; + // globaler Speicher für alle HeftyHashes aller Threads uint32_t *d_heftyHashes[8]; /* Hash-Tabellen */ __constant__ uint32_t hefty_gpu_constantTable[64]; +#if USE_SHARED +#define heftyLookUp(x) (*((uint32_t*)heftytab + (x))) +#else +#define heftyLookUp(x) hefty_gpu_constantTable[x] +#endif // muss expandiert werden __constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message __constant__ uint32_t hefty_gpu_register[8]; __constant__ uint32_t hefty_gpu_sponge[4]; -uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL, +uint32_t hefty_cpu_hashTable[] = { + 0x6a09e667UL, 0xbb67ae85UL, 0x3c6ef372UL, 0xa54ff53aUL, @@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL, 0x9b05688cUL, 0x1f83d9abUL, 0x5be0cd19UL }; + uint32_t hefty_cpu_constantTable[] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, @@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = { 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL }; -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) +//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static __host__ __device__ uint32_t S(uint32_t x, int n) +{ + return (((x) >> (n)) | ((x) << (32 - (n)))); +} +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) // uint8_t -#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) +#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) { - uint16_t w = (x >> 16) ^ (x & 0xffff); - uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); - return (n >> 2) ^ (n & 0x03); + uint16_t w = (x >> 16) ^ (x & 0xffff); + uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); + return 24 - (((n >> 2) ^ (n & 0x03)) << 3); } // 4 auf einmal -#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) -#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) +#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) +#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) -__host__ __device__ void Mangle(uint32_t *inp) +__host__ __forceinline__ __device__ void Mangle(uint32_t *inp) { - uint32_t r = smoosh4Quad(inp[0]); - //uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) ); - //uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) ); - //uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) ); - //uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) ); - - inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); - - switch (smoosh2(inp[1])) { - case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break; - case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break; - case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break; - case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break; - } + uint32_t r = smoosh4Quad(inp[0]); + uint32_t inp0org; + uint32_t tmp0Mask, tmp1Mask; + uint32_t in1, in2, isAddition; + uint32_t tmp; + uint8_t b; + + inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); + + r += 0x01010101; + tmp = smoosh2(inp[1]); + b = getByte(r,tmp); + inp0org = S(inp[0], b); + tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 + tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 - uint32_t tmp = smoosh2(inp[1] ^ inp[2]); - switch (tmp) { - case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break; - case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break; - case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break; - case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break; - } - - inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; + in1 = (inp[2] & ~inp0org) | + (tmp1Mask & ~inp[2] & inp0org) | + (~tmp0Mask & ~inp[2] & inp0org); + in2 = inp[2] += ~inp0org; + isAddition = ~tmp0Mask & tmp1Mask; + inp[2] = isAddition ? in2 : in1; + + r += 0x01010101; + tmp = smoosh2(inp[1] ^ inp[2]); + b = getByte(r,tmp); + inp0org = S(inp[0], b); + tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 + tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 + + in1 = (inp[3] & ~inp0org) | + (tmp1Mask & ~inp[3] & inp0org) | + (~tmp0Mask & ~inp[3] & inp0org); + in2 = inp[3] += ~inp0org; + isAddition = ~tmp0Mask & tmp1Mask; + inp[3] = isAddition ? in2 : in1; + + inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; } __host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) { - inp[0] ^= x; - Mangle(inp); + inp[0] ^= x; + Mangle(inp); } __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) { - uint32_t y = inp[0]; - Mangle(inp); - return y; + uint32_t y = inp[0]; + Mangle(inp); + return y; } __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) { - uint32_t r = Squeeze(sponge); - - //uint8_t r0 = r >> 8; - uint8_t r1 = r & 0xFF; - uint32_t y = 1 << ((r >> 8) & 0x1F); - - //uint32_t retVal; - //retVal = x; - - uint32_t resArr[4]; - resArr[0] = x; - resArr[1] = x & ~y; - resArr[2] = x | y; - resArr[3] = x ^ y; - return resArr[r1 & 0x03]; - - /* - switch(r1 & 0x03) - { - case 0: - break; - case 1: - retVal = x & ~y; - break; - case 2: - retVal = x | y; - break; - case 3: - retVal = x ^ y; - break; - } - return retVal; - */ + uint32_t r = Squeeze(sponge); + uint32_t t = ((r >> 8) & 0x1F); + uint32_t y = 1 << t; + + uint32_t a = (((r>>1) & 0x01) << t) & y; + uint32_t b = ((r & 0x01) << t) & y; + uint32_t c = x & y; + + uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c); + return retVal; } __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) { - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] += tmpBr; } __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) { - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] += tmpBr; } // Die Hash-Funktion __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory -#if USE_SHARED - extern __shared__ unsigned char s[]; - uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]); -#else - // reduktion von 256 byte auf 128 byte - uint32_t W1[16]; - uint32_t W2[16]; + #if USE_SHARED + extern __shared__ char heftytab[]; + if(threadIdx.x < 64) + { + *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; + } + + __syncthreads(); #endif - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = startNounce + thread; + + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory + // reduktion von 256 byte auf 128 byte + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + #pragma unroll 4 - for(int k=0; k < 4; k++) - sponge[k] = hefty_gpu_sponge[k]; + for(int k=0; k < 4; k++) + sponge[k] = hefty_gpu_sponge[k]; - // pre + // pre #pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = hefty_gpu_register[k]; - hash[k] = regs[k]; - } - - //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding + for (int k=0; k < 8; k++) + { + regs[k] = hefty_gpu_register[k]; + hash[k] = regs[k]; + } + + //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding #pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = hefty_gpu_blockHeader[k]; - W1[3] = SWAB32(nounce); - + for(int k=0;k<16;k++) + W1[k] = hefty_gpu_blockHeader[k]; + W1[3] = SWAB32(nounce); - // 2. Runde + // 2. Runde #pragma unroll 16 - for(int j=0;j<16;j++) - Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]); + for(int j=0;j<16;j++) + Absorb(sponge, W1[j] ^ heftyLookUp(j)); // Progress W1 (Bytes 0...63) #pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge); - } + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge); + } // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... - + #pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - #pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge); - } - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - - + for(int k=0;k<3;k++) + { + #pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + #pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + #pragma unroll 16 + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge); + } + #pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + #pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; + for(int k=0;k<8;k++) + hash[k] += regs[k]; #pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); - } + for(int k=0;k<8;k++) + ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + } } // Setup-Funktionen __host__ void hefty_cpu_init(int thr_id, int threads) { - cudaSetDevice(thr_id); + cudaSetDevice(device_map[thr_id]); - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( hefty_gpu_constantTable, - hefty_cpu_constantTable, - sizeof(uint32_t) * 64 ); + cudaGetDeviceProperties(&props, device_map[thr_id]); - // Speicher für alle Hefty1 hashes belegen - cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( hefty_gpu_constantTable, + hefty_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // Speicher für alle Hefty1 hashes belegen + cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); } __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) - // data muss 84-Byte haben! + // data muss 84-Byte haben! { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - msgBlock[21] |= 0x80; - msgBlock[31] = 672; // bitlen - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. Der Hash wird dann an die Threads - // übergeben - - // Erstelle expandierten Block W - uint32_t W[64]; - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - - // pre - memset(sponge, 0, sizeof(uint32_t) * 4); + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + msgBlock[21] |= 0x80; + msgBlock[31] = 672; // bitlen + + for(int i=0;i<31;i++) // Byteorder drehen + msgBlock[i] = SWAB32(msgBlock[i]); + + // die erste Runde wird auf der CPU durchgeführt, da diese für + // alle Threads gleich ist. Der Hash wird dann an die Threads + // übergeben + + // Erstelle expandierten Block W + uint32_t W[64]; + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + + // pre + memset(sponge, 0, sizeof(uint32_t) * 4); for (int k=0; k < 8; k++) - { - regs[k] = hefty_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<16;j++) - Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); - - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int j=16;j<64;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // sponge speichern - - cudaMemcpyToSymbol( hefty_gpu_sponge, - sponge, - sizeof(uint32_t) * 4 ); - // hash speichern - cudaMemcpyToSymbol( hefty_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - - // Blockheader setzen (korrekte Nonce fehlt da drin noch) - cudaMemcpyToSymbol( hefty_gpu_blockHeader, - &msgBlock[16], - 64); + { + regs[k] = hefty_cpu_hashTable[k]; + hash[k] = regs[k]; + } + + // 1. Runde + for(int j=0;j<16;j++) + Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); + + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int j=16;j<64;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + // sponge speichern + + cudaMemcpyToSymbol( hefty_gpu_sponge, + sponge, + sizeof(uint32_t) * 4 ); + // hash speichern + cudaMemcpyToSymbol( hefty_gpu_register, + hash, + sizeof(uint32_t) * 8 ); + + // Blockheader setzen (korrekte Nonce fehlt da drin noch) + cudaMemcpyToSymbol( hefty_gpu_blockHeader, + &msgBlock[16], + 64); } __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) { - const int threadsperblock = 128; + // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, + // alle anderen mit 512 Threads. + int threadsperblock = (props.major >= 3) ? 768 : 512; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) -#if USE_SHARED - size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte + // Größe des dynamischen Shared Memory Bereichs + #if USE_SHARED + size_t shared_size = 8 * 64 * sizeof(uint32_t); #else - size_t shared_size = 0; + size_t shared_size = 0; #endif -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); + hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); } diff --git a/cuda_keccak512.cu b/cuda_keccak512.cu index 65db302521..c9b0a6cc14 100644 --- a/cuda_keccak512.cu +++ b/cuda_keccak512.cu @@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); diff --git a/cuda_sha256.cu b/cuda_sha256.cu index 97b7051802..050f4b4eda 100644 --- a/cuda_sha256.cu +++ b/cuda_sha256.cu @@ -5,8 +5,6 @@ #include #include -#define W_ALIGNMENT 65 - // Folgende Definitionen später durch header ersetzen typedef unsigned int uint32_t; @@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH nonceVector[thread] = nounce; // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - //extern __shared__ unsigned char s[]; - //uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]); uint32_t W1[16]; uint32_t W2[16]; @@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) { - const int threadsperblock = 128; + const int threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) - //size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); diff --git a/heavy.cu b/heavy.cu index 6ce1a663d2..453c88761b 100644 --- a/heavy.cu +++ b/heavy.cu @@ -163,6 +163,41 @@ extern "C" int cuda_num_devices() return GPU_N; } +static bool substringsearch(const char *haystack, const char *needle, int &match) +{ + int hlen = strlen(haystack); + int nlen = strlen(needle); + for (int i=0; i < hlen; ++i) + { + if (haystack[i] == ' ') continue; + int j=0, x = 0; + while(j < nlen) + { + if (haystack[i+x] == ' ') {++x; continue;} + if (needle[j] == ' ') {++j; continue;} + if (needle[j] == '#') return ++match == needle[j+1]-'0'; + if (tolower(haystack[i+x]) != tolower(needle[j])) break; + ++j; ++x; + } + if (j == nlen) return true; + } + return false; +} + +// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1) +extern "C" int cuda_finddevice(char *name) +{ + int num = cuda_num_devices(); + int match = 0; + for (int i=0; i < num; ++i) + { + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, i) == cudaSuccess) + if (substringsearch(props.name, name, match)) return i; + } + return -1; +} + // Zeitsynchronisations-Routine von cudaminer mit CPU sleep typedef struct { double value[8]; } tsumarray; cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)