From 2ca6ede92b2c43e485e01ca839d6131b15586ec9 Mon Sep 17 00:00:00 2001
From: Christian Buchner <buchner@Rennsemmel.(none)>
Date: Thu, 27 Mar 2014 00:47:12 +0100
Subject: [PATCH] update to version v0.5 (2014-03-27)

---
 Makefile.am         |   2 +-
 Makefile.in         |   2 +-
 README.txt          |  42 +++-
 ccminer.vcxproj     |  27 +--
 configure           |  20 +-
 configure.ac        |   2 +-
 cpu-miner.c         |  50 +++-
 cpuminer-config.h   |   4 +-
 cuda_blake512.cu    |   4 +-
 cuda_combine.cu     |   2 +-
 cuda_fugue256.cu    |   9 +-
 cuda_groestl512.cu  |   2 +-
 cuda_groestlcoin.cu | 276 ++++++++++++----------
 cuda_hefty1.cu      | 565 +++++++++++++++++++++++---------------------
 cuda_keccak512.cu   |   2 +-
 cuda_sha256.cu      |   9 +-
 heavy.cu            |  35 +++
 17 files changed, 588 insertions(+), 465 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index c523ffa6a4..3f8f816790 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -34,4 +34,4 @@ ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
diff --git a/Makefile.in b/Makefile.in
index 0925b1f4d0..130d7ce2d4 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/README.txt b/README.txt
index 31dbac55f6..3c956e081f 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
+ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
 -------------------------------------------------------------
 
 ***************************************************************
@@ -38,6 +38,11 @@ its command line interface and options.
                           fugue256    use to mine Fuguecoin
                           groestl     use to mine Groestlcoin
 
+  -d, --devices         gives a comma separated list of CUDA device IDs
+                        to operate on. Device IDs start counting from 0!
+                        Alternatively give string names of your card like
+                        gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
+
   -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
   -O, --userpass=U:P    username:password pair for mining server
   -u, --user=USERNAME   username for mining server
@@ -63,8 +68,10 @@ its command line interface and options.
   -V, --version         display version information and exit
   -h, --help            display this help text and exit
 
+
 >>> Examples <<<
 
+
 Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
 
 ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@@ -107,22 +114,33 @@ from your old clunkers.
 
 >>> RELEASE HISTORY <<<
 
-  March, 24 2014 fixed Groestl pool support
+  March, 27 2014  Heavycoin exchange rates soar, and as a result this coin
+                  gets some love: We greatly optimized the Hefty1 kernel
+                  for speed. Expect some hefty gains, especially on 750Ti's!
+
+                  By popular demand, we added the -d option as known from
+                  cudaminer.
+
+                  different compute capability builds are now provided until
+                  we figure out how to pack everything into a single executable
+                  in a Windows build.
+
+  March, 24 2014  fixed Groestl pool support
 
-                 went back to Compute 1.x for cuda_hefty1.cu kernel by
-                 default after numerous reports of ccminer v0.2/v0.3
-                 not working with HeavyCoin for some people.
+                  went back to Compute 1.x for cuda_hefty1.cu kernel by
+                  default after numerous reports of ccminer v0.2/v0.3
+                  not working with HeavyCoin for some people.
 
-  March, 23 2014 added Groestlcoin support. stratum status unknown
-                 (the only pool is currently down for fixing issues)
+  March, 23 2014  added Groestlcoin support. stratum status unknown
+                  (the only pool is currently down for fixing issues)
 
-  March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
-                 on Fermi and Maxwell devices. Kepler may suffer slightly
-                 (3-5%)
+  March, 21 2014  use of shared memory in Fugue256 kernel boosts hash rates
+                  on Fermi and Maxwell devices. Kepler may suffer slightly
+                  (3-5%)
 
-                 Fixed Stratum for Fuguecoin. Tested on dwarfpool.
+                  Fixed Stratum for Fuguecoin. Tested on dwarfpool.
 
-  March, 18 2014 initial release.
+  March, 18 2014  initial release.
 
 
 >>> AUTHORS <<<
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 029fceb062..5424f0a422 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <CInterleavedPTX>true</CInterleavedPTX>
     </CudaCompile>
     <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
     </CudaCompile>
     <CudaCompile>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
       <Include>
       </Include>
       <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <CInterleavedPTX>true</CInterleavedPTX>
     </CudaCompile>
     <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
     </CudaCompile>
     <CudaCompile>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
       <Include>
       </Include>
       <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <CInterleavedPTX>true</CInterleavedPTX>
     </CudaCompile>
     <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
     </CudaCompile>
     <CudaCompile>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
       <Include>
       </Include>
       <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <CInterleavedPTX>true</CInterleavedPTX>
     </CudaCompile>
     <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
     </CudaCompile>
     <CudaCompile>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
       <Include>
       </Include>
       <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     <CudaCompile Include="cuda_fugue256.cu" />
     <CudaCompile Include="cuda_groestl512.cu" />
     <CudaCompile Include="cuda_groestlcoin.cu" />
-    <CudaCompile Include="cuda_hefty1.cu">
-      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
-      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
-      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
-      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
-      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
-      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
-      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
-      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
-    </CudaCompile>
+    <CudaCompile Include="cuda_hefty1.cu" />
     <CudaCompile Include="cuda_keccak512.cu" />
     <CudaCompile Include="cuda_sha256.cu" />
     <CudaCompile Include="heavy.cu" />
diff --git a/configure b/configure
index c565c170e6..d826992518 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
+# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
 #
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -557,8 +557,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ccminer'
 PACKAGE_TARNAME='ccminer'
-PACKAGE_VERSION='2014.03.24'
-PACKAGE_STRING='ccminer 2014.03.24'
+PACKAGE_VERSION='2014.03.27'
+PACKAGE_STRING='ccminer 2014.03.27'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
+\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1368,7 +1368,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
+     short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
    esac
   cat <<\_ACEOF
 
@@ -1469,7 +1469,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ccminer configure 2014.03.24
+ccminer configure 2014.03.27
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ccminer $as_me 2014.03.24, which was
+It was created by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -2901,7 +2901,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ccminer'
- VERSION='2014.03.24'
+ VERSION='2014.03.27'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ccminer $as_me 2014.03.24, which was
+This file was extended by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -7184,7 +7184,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ccminer config.status 2014.03.24
+ccminer config.status 2014.03.27
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 18063f98f7..ce82fdc86d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.03.24])
+AC_INIT([ccminer], [2014.03.27])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index e88b79ed77..370a257f02 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -51,8 +51,13 @@
 // from heavy.cu
 #ifdef __cplusplus
 extern "C"
+{
 #endif
 int cuda_num_devices();
+int cuda_finddevice(char *name);
+#ifdef __cplusplus
+}
+#endif
 
 
 #ifdef __linux /* Linux specific policy and affinity management */
@@ -144,10 +149,11 @@ static int opt_scantime = 5;
 static json_t *opt_config;
 static const bool opt_time = true;
 static sha256_algos opt_algo = ALGO_HEAVY;
-static int opt_n_threads;
+static int opt_n_threads = 0;
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
 static int num_processors;
+int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@@ -185,7 +191,11 @@ Options:\n\
   -a, --algo=ALGO       specify the algorithm to use\n\
                         fugue256  Fuguecoin hash\n\
                         heavy     Heavycoin hash\n\
-  -v, --vote=VOTE       block reward vote\n\
+  -d, --devices         takes a comma separated list of CUDA devices to use.\n\
+                        Device IDs start counting from 0! Alternatively takes\n\
+                        string names of your cards like gtx780ti or gt640#2\n\
+                        (matching 2nd gt640 in the PC)\n\
+  -v, --vote=VOTE       block reward vote (for HeavyCoin)\n\
   -m, --trust-pool      trust the max block reward vote (maxvote) sent by the pool\n\
   -o, --url=URL         URL of mining server\n\
   -O, --userpass=U:P    username:password pair for mining server\n\
@@ -227,7 +237,7 @@ static char const short_options[] =
 #ifdef HAVE_SYSLOG_H
 	"S"
 #endif
-	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
+	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
 
 static struct option const options[] = {
 	{ "algo", 1, NULL, 'a' },
@@ -259,6 +269,7 @@ static struct option const options[] = {
 	{ "user", 1, NULL, 'u' },
 	{ "userpass", 1, NULL, 'O' },
 	{ "version", 0, NULL, 'V' },
+	{ "devices", 1, NULL, 'd' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
 	case 'S':
 		use_syslog = true;
 		break;
+	case 'd': // CB
+		{
+			char * pch = strtok (arg,",");
+			opt_n_threads = 0;
+			while (pch != NULL) {
+				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
+				{
+					if (atoi(pch) < num_processors)
+						device_map[opt_n_threads++] = atoi(pch);
+					else {
+						applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
+						exit(1);
+					}
+				} else {
+					int device = cuda_finddevice(pch);
+					if (device >= 0 && device < num_processors)
+						device_map[opt_n_threads++] = device;
+					else {
+						applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
+						exit(1);
+					}
+				}
+				pch = strtok (NULL, ",");
+			}
+		}
+		break;
 	case 'V':
 		show_version_and_exit();
 	case 'h':
@@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
 }
 #endif
 
-#define PROGRAM_VERSION "0.4"
+#define PROGRAM_VERSION "0.5"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
@@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
 	rpc_user = strdup("");
 	rpc_pass = strdup("");
 
+	pthread_mutex_init(&applog_lock, NULL);
+	num_processors = cuda_num_devices();
+
 	/* parse command line */
 	parse_cmdline(argc, argv);
 
@@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 
-	pthread_mutex_init(&applog_lock, NULL);
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
@@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
 	}
 #endif
 
-	num_processors = cuda_num_devices();
 	if (num_processors == 0)
 	{
 		applog(LOG_ERR, "No CUDA devices found! terminating.");
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 50914abdb7..8b63e2e3cf 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.03.24"
+#define PACKAGE_STRING "ccminer 2014.03.27"
 
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.03.24"
+#define PACKAGE_VERSION "2014.03.27"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
diff --git a/cuda_blake512.cu b/cuda_blake512.cu
index 79fef85f94..3602b232cd 100644
--- a/cuda_blake512.cu
+++ b/cuda_blake512.cu
@@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
 
 __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
diff --git a/cuda_combine.cu b/cuda_combine.cu
index eabb2650ca..fb1033c155 100644
--- a/cuda_combine.cu
+++ b/cuda_combine.cu
@@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu
index bc8b9eeabe..7f0909948f 100644
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@@ -9,7 +9,10 @@
 
 #define USE_SHARED 1
 
-// heavy.cu
+// aus cpu-miner.c
+extern int device_map[8];
+
+// aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
 // Folgende Definitionen sp�ter durch header ersetzen
@@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 
 void fugue256_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 4 * 256 * sizeof(uint32_t);
 #else
diff --git a/cuda_groestl512.cu b/cuda_groestl512.cu
index e29f4f6dbc..1aebcf341c 100644
--- a/cuda_groestl512.cu
+++ b/cuda_groestl512.cu
@@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index 442b1dc83d..56d1a8933a 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -9,6 +9,10 @@
 
 #define USE_SHARED 1
 
+// aus cpu-miner.c
+extern int device_map[8];
+
+// aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
 // Folgende Definitionen sp�ter durch header ersetzen
@@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 
-// globaler Speicher f�r unsere Ergebnisse
-uint32_t *d_hashGROESTLCOINoutput[8];
-
-__constant__ uint32_t groestlcoin_gpu_state[32];
 __constant__ uint32_t groestlcoin_gpu_msg[32];
-__constant__ uint32_t sha256coin_gpu_constantTable[64];
-__constant__ uint32_t sha256coin_gpu_register[8];
 
 #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
 
@@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 
-#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
+#if __CUDA_ARCH__ < 350 
+    // Kepler (Compute 3.0)
+    #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+    // Kepler (Compute 3.5)
+    #define S(x, n) __funnelshift_r( x, x, n );
+#endif
 #define R(x, n)			((x) >> (n))
 #define Ch(x, y, z)		((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 
 
-__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 {
 	uint32_t t[32];
 
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
-#pragma unroll 16
-		for(int k=0;k<16;k++)
+		switch(r)
 		{
-			a[(k*2)+0] ^= PC32up(k * 0x10, r);
-			//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
+			case 0:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
+			case 1:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
+			case 2:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
+			case 3:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
+			case 4:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
+			case 5:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
+			case 6:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
+			case 7:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
+			case 8:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
+			case 9:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
+			case 10:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
+			case 11:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
+			case 12:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
+			case 13:
+#pragma unroll 16
+				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
 
 		// RBTT
@@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 	}
 }
 
-__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 {	
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
 
-#pragma unroll 16
-		for(int k=0;k<16;k++)
+		switch(r)
 		{
-			a[(k*2)+0] ^= QC32up(k * 0x10, r);
-			a[(k*2)+1] ^= QC32dn(k * 0x10, r);
+			case 0:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
+			case 1:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
+			case 2:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
+			case 3:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
+			case 4:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
+			case 5:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
+			case 6:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
+			case 7:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
+			case 8:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
+			case 9:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
+			case 10:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
+			case 11:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
+			case 12:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
+			case 13:
+	#pragma unroll 16
+				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
 
 		// RBTT
@@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 	}
 }
 #if USE_SHARED
-__global__ void  __launch_bounds__(256) 
+__global__ void  /* __launch_bounds__(256) */
 #else
 __global__ void 
 #endif
 
- groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+ groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@@ -204,146 +286,111 @@ __global__ void
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-	/////
-	///// Lieber groestl, mach, dass es abgeht!!!
-	/////
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
-		uint32_t g[32];
-
 
 #pragma unroll 32
-		for(int k=0;k<32;k++)
-		{
-                        // TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
-                        //       constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
-			state[k] = groestlcoin_gpu_state[k];
-			message[k] = groestlcoin_gpu_msg[k];
-		}
+		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
 
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 
 #pragma unroll 32
-		for(int u=0;u<32;u++)
-			g[u] = message[u] ^ state[u];  // TODO: state ist fast ueberall 0.
+		for(int u=0;u<32;u++) state[u] = message[u];
+		state[31] ^= 0x20000;
 
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);        // TODO: g[] entspricht fast genau message[]
-		groestlcoin_perm_Q(message, mixtabs);  //       kann man das ausnutzen?
+		groestlcoin_perm_P(state, mixtabs);
+		state[31] ^= 0x20000;
+		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
+		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
-		
 #pragma unroll 32
-		for(int u=0;u<32;u++)
-		{
-                        // TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
-                        //       was den Registerbedarf senken koennte?
-			state[u] ^= g[u] ^ message[u];
-			g[u] = state[u];
-		}
+		for(int u=0;u<32;u++) state[u] ^= message[u];
+
+#pragma unroll 32
+		for(int u=0;u<32;u++) message[u] = state[u];
 
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 
 #pragma unroll 32
-		for(int u=0;u<32;u++)
-			state[u] ^= g[u];
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 
 		////
 		//// 2. Runde groestl
 		////
 #pragma unroll 16
-		for(int k=0;k<16;k++)
-			message[k] = state[k + 16];
-
-#pragma unroll 32
-		for(int k=0;k<32;k++)
-			state[k] = groestlcoin_gpu_state[k];
-
-#pragma unroll 16
-		for(int k=0;k<16;k++)
+		for(int k=0;k<16;k++) message[k] = state[k + 16];
+#pragma unroll 14
+		for(int k=1;k<15;k++)
 			message[k+16] = 0;
 
-		message[16] = 0x80;		
+		message[16] = 0x80;
 		message[31] = 0x01000000;
 
 #pragma unroll 32
 		for(int u=0;u<32;u++)
-			g[u] = message[u] ^ state[u];
+			state[u] = message[u];
+		state[31] ^= 0x20000;
 
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(state, mixtabs);
+		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
+		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 		
 #pragma unroll 32
-		for(int u=0;u<32;u++)
-		{
-			state[u] ^= g[u] ^ message[u];
-			g[u] = state[u];
-		}
+		for(int u=0;u<32;u++) state[u] ^= message[u];
+
+#pragma unroll 32
+		for(int u=0;u<32;u++) message[u] = state[u];
 
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 
 #pragma unroll 32
-		for(int u=0;u<32;u++)
-			state[u] ^= g[u];
-		
-/*
-	#pragma unroll 8
-		for(int k=0;k<8;k++)
-			hash[k] = state[k+16];
-*/
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 
 		// kopiere Ergebnis
-		/*
-#pragma unroll 16
-		for(int k=0;k<16;k++)
-			((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
-			*/
-		int i;
+		int i, position = -1;
 		bool rc = true;
-	
+
+#pragma unroll 8
 		for (i = 7; i >= 0; i--) {
 			if (state[i+16] > pTarget[i]) {
-				rc = false;
-				break;
-			}
-			if (state[i+16] < pTarget[i]) {
-				rc = true;
-				break;
-			}
+				if(position < i) {
+					position = i;
+					rc = false;
+				}
+	 		}
+	 		if (state[i+16] < pTarget[i]) {
+				if(position < i) {
+					position = i;
+					rc = true;
+				}
+	 		}
 		}
 
 		if(rc == true)
-		{
 			if(resNounce[0] > nounce)
-			{
 				resNounce[0] = nounce;
-				/*
-				#pragma unroll 8
-				for(int k=0;k<8;k++)					
-					((uint32_t*)outputHash)[k] = (hash[k]);
-				*/
-			}
-		}
-
 	}
 }
 
@@ -360,7 +407,7 @@ __global__ void
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 	cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
 // Texturen mit obigem Makro initialisieren
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 
-	// setze register 
-        // TODO: fast vollstaendige Vorbelegung mit Nullen.
-        //       da besteht doch Optimierungspotenzial im GPU Kernel
-        //       denn mit Nullen braucht man nicht wirklich rechnen.
-	uint32_t groestl_state_init[32];
-	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
-	groestl_state_init[31] = 0x20000;
-
-	// state speichern
-	cudaMemcpyToSymbol(	groestlcoin_gpu_state,
-						groestl_state_init,
-						128);
-
+	// Speicher f�r Gewinner-Nonce belegen
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
-
-	// Speicher f�r alle Ergebnisse belegen (nur f�r Debug)
-	cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
@@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	//fprintf(stderr, "ThrID: %d\n", thr_id);
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
+	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
 
 	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-	/// Debug
-	//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
-
-	// Nounce
-	//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
diff --git a/cuda_hefty1.cu b/cuda_hefty1.cu
index f1333d3dcd..2f72c0fe46 100644
--- a/cuda_hefty1.cu
+++ b/cuda_hefty1.cu
@@ -2,26 +2,40 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 
+// aus cpu-miner.c
+extern int device_map[8];
+
 #include <stdio.h>
 #include <memory.h>
 
+#define USE_SHARED 1
+
 // Folgende Definitionen sp�ter durch header ersetzen
 typedef unsigned int uint32_t;
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 
+// diese Struktur wird in der Init Funktion angefordert
+static cudaDeviceProp props;
+
 // globaler Speicher f�r alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
 
 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
+#if USE_SHARED
+#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
+#else
+#define heftyLookUp(x) hefty_gpu_constantTable[x]
+#endif
 
 // muss expandiert werden
 __constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
 __constant__ uint32_t hefty_gpu_register[8];
 __constant__ uint32_t hefty_gpu_sponge[4];
 
-uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
+uint32_t hefty_cpu_hashTable[] = {
+    0x6a09e667UL,
     0xbb67ae85UL,
     0x3c6ef372UL,
     0xa54ff53aUL,
@@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
     0x9b05688cUL,
     0x1f83d9abUL,
     0x5be0cd19UL };
+    
 uint32_t hefty_cpu_constantTable[] = {
-	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
+    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
     0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
     0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
     0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
@@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
     0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
 };
 
-#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
-#define R(x, n)			((x) >> (n))
-#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define S0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
-#define S1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
-#define s0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
-#define s1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))
+//#define S(x, n)          (((x) >> (n)) | ((x) << (32 - (n))))
+static __host__ __device__ uint32_t S(uint32_t x, int n)
+{
+    return (((x) >> (n)) | ((x) << (32 - (n))));
+}
+#define R(x, n)          ((x) >> (n))
+#define Ch(x, y, z)      ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)     ((x & (y | z)) | (y & z))
+#define S0(x)            (S(x, 2) ^ S(x, 13) ^ S(x, 22))
+#define S1(x)            (S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define s0(x)            (S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define s1(x)            (S(x, 17) ^ S(x, 19) ^ R(x, 10))
 
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 
 // uint8_t
-#define smoosh4(x)		( ((x)>>4) ^ ((x) & 0x0F) )
+#define smoosh4(x)       ( ((x)>>4) ^ ((x) & 0x0F) )
 __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
 {
-	uint16_t w = (x >> 16) ^ (x & 0xffff);
-	uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
-	return (n >> 2) ^ (n & 0x03);
+    uint16_t w = (x >> 16) ^ (x & 0xffff);
+    uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
+    return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
 }
 // 4 auf einmal
-#define smoosh4Quad(x)	( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
-#define getByte(x,y)	( ((x) >> (y)) & 0xFF )
+#define smoosh4Quad(x)   ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
+#define getByte(x,y)     ( ((x) >> (y)) & 0xFF )
 
-__host__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 {
-	uint32_t r = smoosh4Quad(inp[0]);
-	//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
-	//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
-	//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
-	//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
-
-	inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
-	
-	switch (smoosh2(inp[1])) {
-      case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
-      case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
-      case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
-      case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
-    }
+    uint32_t r = smoosh4Quad(inp[0]);
+    uint32_t inp0org;
+    uint32_t tmp0Mask, tmp1Mask;
+    uint32_t in1, in2, isAddition;
+    uint32_t tmp;
+    uint8_t b;
+
+    inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
+
+    r += 0x01010101;
+    tmp = smoosh2(inp[1]);
+    b = getByte(r,tmp);
+    inp0org = S(inp[0], b);
+    tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
+    tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
     
-	uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
-    switch (tmp) {
-      case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
-      case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
-      case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
-      case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
-    }
-
-	inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
+    in1 =    (inp[2] & ~inp0org) | 
+            (tmp1Mask & ~inp[2] & inp0org) |
+            (~tmp0Mask & ~inp[2] & inp0org);
+    in2 = inp[2] += ~inp0org;
+    isAddition = ~tmp0Mask & tmp1Mask;
+    inp[2] = isAddition ? in2 : in1;
+    
+    r += 0x01010101;
+    tmp = smoosh2(inp[1] ^ inp[2]);
+    b = getByte(r,tmp);
+    inp0org = S(inp[0], b);
+    tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
+    tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
+
+    in1 =    (inp[3] & ~inp0org) | 
+            (tmp1Mask & ~inp[3] & inp0org) |
+            (~tmp0Mask & ~inp[3] & inp0org);
+    in2 = inp[3] += ~inp0org;
+    isAddition = ~tmp0Mask & tmp1Mask;
+    inp[3] = isAddition ? in2 : in1;
+
+    inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
 }
 
 __host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
 {
-	inp[0] ^= x;
-	Mangle(inp);
+    inp[0] ^= x;
+    Mangle(inp);
 }
 
 __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
 {
-	uint32_t y = inp[0];
-	Mangle(inp);
-	return y;
+    uint32_t y = inp[0];
+    Mangle(inp);
+    return y;
 }
 
 __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
 {
-	uint32_t r = Squeeze(sponge);
-
-	//uint8_t r0 = r >> 8;
-	uint8_t r1 = r & 0xFF;
-	uint32_t y = 1 << ((r >> 8) & 0x1F);
-
-	//uint32_t retVal;
-	//retVal = x;
-	
-	uint32_t resArr[4];
-	resArr[0] = x;
-	resArr[1] = x & ~y;
-	resArr[2] = x | y;
-	resArr[3] = x ^ y;
-	return resArr[r1 & 0x03];
-
-	/*
-	switch(r1 & 0x03)
-	{
-	case 0:
-		break;
-    case 1:
-        retVal = x & ~y;
-		break;
-    case 2:
-        retVal = x | y;
-		break;
-    case 3:
-        retVal = x ^ y;
-		break;
-    }
-	return retVal;
-	*/
+    uint32_t r = Squeeze(sponge);
+    uint32_t t = ((r >> 8) & 0x1F);
+    uint32_t y = 1 << t;
+
+    uint32_t a = (((r>>1) & 0x01) << t) & y;
+    uint32_t b = ((r & 0x01) << t) & y;
+    uint32_t c = x & y;
+
+    uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
+    return retVal;
 }
 
 __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
-	uint32_t tmpBr;
-
-	uint32_t brG = Br(sponge, regs[6]);	
-	uint32_t brF = Br(sponge, regs[5]);
-	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
-	uint32_t brE = Br(sponge, regs[4]);
-	uint32_t tmp2 = tmp1 + S1(brE);
-	uint32_t brC = Br(sponge, regs[2]);
-	uint32_t brB = Br(sponge, regs[1]);
-	uint32_t brA = Br(sponge, regs[0]);
-	uint32_t tmp3 = Maj(brA, brB, brC);
-	tmpBr = Br(sponge, regs[0]);
-	uint32_t tmp4 = tmp3 + S0(tmpBr);
-	tmpBr = Br(sponge, tmp2);
-
-	#pragma unroll 7
-	for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-	regs[0] = tmp2 + tmp4;
-	regs[4] += tmpBr;
+    uint32_t tmpBr;
+
+    uint32_t brG = Br(sponge, regs[6]);    
+    uint32_t brF = Br(sponge, regs[5]);
+    uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
+    uint32_t brE = Br(sponge, regs[4]);
+    uint32_t tmp2 = tmp1 + S1(brE);
+    uint32_t brC = Br(sponge, regs[2]);
+    uint32_t brB = Br(sponge, regs[1]);
+    uint32_t brA = Br(sponge, regs[0]);
+    uint32_t tmp3 = Maj(brA, brB, brC);
+    tmpBr = Br(sponge, regs[0]);
+    uint32_t tmp4 = tmp3 + S0(tmpBr);
+    tmpBr = Br(sponge, tmp2);
+
+    #pragma unroll 7
+    for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+    regs[0] = tmp2 + tmp4;
+    regs[4] += tmpBr;
 }
 
 __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
-	uint32_t tmpBr;
-
-	uint32_t brG = Br(sponge, regs[6]);	
-	uint32_t brF = Br(sponge, regs[5]);
-	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
-	uint32_t brE = Br(sponge, regs[4]);
-	uint32_t tmp2 = tmp1 + S1(brE);
-	uint32_t brC = Br(sponge, regs[2]);
-	uint32_t brB = Br(sponge, regs[1]);
-	uint32_t brA = Br(sponge, regs[0]);
-	uint32_t tmp3 = Maj(brA, brB, brC);
-	tmpBr = Br(sponge, regs[0]);
-	uint32_t tmp4 = tmp3 + S0(tmpBr);
-	tmpBr = Br(sponge, tmp2);
-
-	for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-	regs[0] = tmp2 + tmp4;
-	regs[4] += tmpBr;
+    uint32_t tmpBr;
+
+    uint32_t brG = Br(sponge, regs[6]);    
+    uint32_t brF = Br(sponge, regs[5]);
+    uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
+    uint32_t brE = Br(sponge, regs[4]);
+    uint32_t tmp2 = tmp1 + S1(brE);
+    uint32_t brC = Br(sponge, regs[2]);
+    uint32_t brB = Br(sponge, regs[1]);
+    uint32_t brA = Br(sponge, regs[0]);
+    uint32_t tmp3 = Maj(brA, brB, brC);
+    tmpBr = Br(sponge, regs[0]);
+    uint32_t tmp4 = tmp3 + S0(tmpBr);
+    tmpBr = Br(sponge, tmp2);
+
+    for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+    regs[0] = tmp2 + tmp4;
+    regs[4] += tmpBr;
 }
 
 // Die Hash-Funktion
 __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		// bestimme den aktuellen Z�hler
-		uint32_t nounce = startNounce + thread;
-	
-		// jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
-#if USE_SHARED
-		extern __shared__ unsigned char s[];
-		uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
-#else
-		// reduktion von 256 byte auf 128 byte
-		uint32_t W1[16];
-		uint32_t W2[16];
+    #if USE_SHARED
+    extern __shared__ char heftytab[];
+    if(threadIdx.x < 64)
+    {
+        *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
+    }
+
+    __syncthreads();
 #endif
 
-		// Initialisiere die register a bis h mit der Hash-Tabelle
-		uint32_t regs[8];
-		uint32_t hash[8];
-		uint32_t sponge[4];
-	
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+    if (thread < threads)
+    {
+        // bestimme den aktuellen Z�hler
+        uint32_t nounce = startNounce + thread;
+    
+        // jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
+        // reduktion von 256 byte auf 128 byte
+        uint32_t W1[16];
+        uint32_t W2[16];
+
+        // Initialisiere die register a bis h mit der Hash-Tabelle
+        uint32_t regs[8];
+        uint32_t hash[8];
+        uint32_t sponge[4];
+    
 #pragma unroll 4
-		for(int k=0; k < 4; k++)
-			sponge[k] = hefty_gpu_sponge[k];
+        for(int k=0; k < 4; k++)
+            sponge[k] = hefty_gpu_sponge[k];
 
-		// pre
+        // pre
 #pragma unroll 8
-		for (int k=0; k < 8; k++)
-		{
-			regs[k] = hefty_gpu_register[k];
-			hash[k] = regs[k];
-		}
-	
-		//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
+        for (int k=0; k < 8; k++)
+        {
+            regs[k] = hefty_gpu_register[k];
+            hash[k] = regs[k];
+        }
+    
+        //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
 #pragma unroll 16
-		for(int k=0;k<16;k++)
-			W1[k] = hefty_gpu_blockHeader[k];
-		W1[3] = SWAB32(nounce);
-
+        for(int k=0;k<16;k++)
+            W1[k] = hefty_gpu_blockHeader[k];
+        W1[3] = SWAB32(nounce);
 
-		// 2. Runde
+        // 2. Runde
 #pragma unroll 16
-		for(int j=0;j<16;j++)
-			Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
+        for(int j=0;j<16;j++)
+            Absorb(sponge, W1[j] ^ heftyLookUp(j));
 
 // Progress W1 (Bytes 0...63)
 #pragma unroll 16
-		for(int j=0;j<16;j++)
-		{
-			Absorb(sponge, regs[3] ^ regs[7]);
-			hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
-		}
+        for(int j=0;j<16;j++)
+        {
+            Absorb(sponge, regs[3] ^ regs[7]);
+            hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
+        }
 
 // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
-		
+        
 #pragma unroll 3
-		for(int k=0;k<3;k++)
-		{
-	#pragma unroll 2
-			for(int j=0;j<2;j++)
-				W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
-	#pragma unroll 5
-			for(int j=2;j<7;j++)
-				W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
-
-	#pragma unroll 8
-			for(int j=7;j<15;j++)
-				W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
-
-			W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
-
-	#pragma unroll 16
-			for(int j=0;j<16;j++)
-			{
-				Absorb(sponge, regs[3] + regs[7]);
-				hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
-			}
-	#pragma unroll 16
-			for(int j=0;j<16;j++)
-				W1[j] = W2[j];
-		}
-		
-
+        for(int k=0;k<3;k++)
+        {
+    #pragma unroll 2
+            for(int j=0;j<2;j++)
+                W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+    #pragma unroll 5
+            for(int j=2;j<7;j++)
+                W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+
+    #pragma unroll 8
+            for(int j=7;j<15;j++)
+                W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+
+            W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+
+    #pragma unroll 16
+            for(int j=0;j<16;j++)
+            {
+                Absorb(sponge, regs[3] + regs[7]);
+                hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
+            }
+    #pragma unroll 16
+            for(int j=0;j<16;j++)
+                W1[j] = W2[j];
+        }
+        
 #pragma unroll 8
-		for(int k=0;k<8;k++)
-			hash[k] += regs[k];
+        for(int k=0;k<8;k++)
+            hash[k] += regs[k];
 
 #pragma unroll 8
-		for(int k=0;k<8;k++)
-			((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
-	}
+        for(int k=0;k<8;k++)
+            ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
+    }
 }
 
 // Setup-Funktionen
 __host__ void hefty_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 
-	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	cudaMemcpyToSymbol(	hefty_gpu_constantTable,
-						hefty_cpu_constantTable,
-						sizeof(uint32_t) * 64 );
+    cudaGetDeviceProperties(&props, device_map[thr_id]);
 
-	// Speicher f�r alle Hefty1 hashes belegen
-	cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
+    // Kopiere die Hash-Tabellen in den GPU-Speicher
+    cudaMemcpyToSymbol(    hefty_gpu_constantTable,
+                        hefty_cpu_constantTable,
+                        sizeof(uint32_t) * 64 );
+
+    // Speicher f�r alle Hefty1 hashes belegen
+    cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 
 __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
-	// data muss 84-Byte haben!
+    // data muss 84-Byte haben!
 {
-	// Nachricht expandieren und setzen
-	uint32_t msgBlock[32];
-
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 84);
-	msgBlock[21] |= 0x80;
-	msgBlock[31] = 672; // bitlen
-	
-	for(int i=0;i<31;i++) // Byteorder drehen
-		msgBlock[i] = SWAB32(msgBlock[i]);
-
-	// die erste Runde wird auf der CPU durchgef�hrt, da diese f�r
-	// alle Threads gleich ist. Der Hash wird dann an die Threads
-	// �bergeben
-
-	// Erstelle expandierten Block W
-	uint32_t W[64];	
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
-	for(int j=16;j<64;j++)
-		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
-
-	// Initialisiere die register a bis h mit der Hash-Tabelle
-	uint32_t regs[8];
-	uint32_t hash[8];
-	uint32_t sponge[4];
-
-	// pre
-	memset(sponge, 0, sizeof(uint32_t) * 4);
+    // Nachricht expandieren und setzen
+    uint32_t msgBlock[32];
+
+    memset(msgBlock, 0, sizeof(uint32_t) * 32);
+    memcpy(&msgBlock[0], data, 84);
+    msgBlock[21] |= 0x80;
+    msgBlock[31] = 672; // bitlen
+    
+    for(int i=0;i<31;i++) // Byteorder drehen
+        msgBlock[i] = SWAB32(msgBlock[i]);
+
+    // die erste Runde wird auf der CPU durchgef�hrt, da diese f�r
+    // alle Threads gleich ist. Der Hash wird dann an die Threads
+    // �bergeben
+
+    // Erstelle expandierten Block W
+    uint32_t W[64];    
+    memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);    
+    for(int j=16;j<64;j++)
+        W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
+
+    // Initialisiere die register a bis h mit der Hash-Tabelle
+    uint32_t regs[8];
+    uint32_t hash[8];
+    uint32_t sponge[4];
+
+    // pre
+    memset(sponge, 0, sizeof(uint32_t) * 4);
     for (int k=0; k < 8; k++)
-	{
-		regs[k] = hefty_cpu_hashTable[k];
-		hash[k] = regs[k];
-	}	
-
-	// 1. Runde
-	for(int j=0;j<16;j++)
-		Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
-
-	for(int j=0;j<16;j++)
-	{
-		Absorb(sponge, regs[3] ^ regs[7]);
-		hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
-	}
-
-	for(int j=16;j<64;j++)
-	{
-		Absorb(sponge, regs[3] + regs[7]);
-		hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
-	}
-
-	for(int k=0;k<8;k++)
-		hash[k] += regs[k];
-
-	// sponge speichern
-
-	cudaMemcpyToSymbol(	hefty_gpu_sponge,
-						sponge,
-						sizeof(uint32_t) * 4 );
-	// hash speichern
-	cudaMemcpyToSymbol(	hefty_gpu_register,
-						hash,
-						sizeof(uint32_t) * 8 );
-
-	// Blockheader setzen (korrekte Nonce fehlt da drin noch)
-	cudaMemcpyToSymbol(	hefty_gpu_blockHeader,
-						&msgBlock[16],
-						64);
+    {
+        regs[k] = hefty_cpu_hashTable[k];
+        hash[k] = regs[k];
+    }    
+
+    // 1. Runde
+    for(int j=0;j<16;j++)
+        Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
+
+    for(int j=0;j<16;j++)
+    {
+        Absorb(sponge, regs[3] ^ regs[7]);
+        hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
+    }
+
+    for(int j=16;j<64;j++)
+    {
+        Absorb(sponge, regs[3] + regs[7]);
+        hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
+    }
+
+    for(int k=0;k<8;k++)
+        hash[k] += regs[k];
+
+    // sponge speichern
+
+    cudaMemcpyToSymbol( hefty_gpu_sponge,
+                        sponge,
+                        sizeof(uint32_t) * 4 );
+    // hash speichern
+    cudaMemcpyToSymbol( hefty_gpu_register,
+                        hash,
+                        sizeof(uint32_t) * 8 );
+
+    // Blockheader setzen (korrekte Nonce fehlt da drin noch)
+    cudaMemcpyToSymbol( hefty_gpu_blockHeader,
+                        &msgBlock[16],
+                        64);
 }
 
 __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+    // Compute 3.x und 5.x Ger�te am besten mit 768 Threads ansteuern,
+    // alle anderen mit 512 Threads.
+    int threadsperblock = (props.major >= 3) ? 768 : 512;
 
-	// berechne wie viele Thread Blocks wir brauchen
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+    // berechne wie viele Thread Blocks wir brauchen
+    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
-#if USE_SHARED
-	size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingef�gt gegen Bank Konflikte
+    // Gr��e des dynamischen Shared Memory Bereichs
+    #if USE_SHARED
+    size_t shared_size = 8 * 64 * sizeof(uint32_t);
 #else
-	size_t shared_size = 0;
+    size_t shared_size = 0;
 #endif
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-	hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+    hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
 }
diff --git a/cuda_keccak512.cu b/cuda_keccak512.cu
index 65db302521..c9b0a6cc14 100644
--- a/cuda_keccak512.cu
+++ b/cuda_keccak512.cu
@@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
+	// Gr��e des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
diff --git a/cuda_sha256.cu b/cuda_sha256.cu
index 97b7051802..050f4b4eda 100644
--- a/cuda_sha256.cu
+++ b/cuda_sha256.cu
@@ -5,8 +5,6 @@
 #include <stdio.h>
 #include <memory.h>
 
-#define W_ALIGNMENT 65
-
 // Folgende Definitionen sp�ter durch header ersetzen
 typedef unsigned int uint32_t;
 
@@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
 		nonceVector[thread] = nounce;
 	
 		// jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
-		//extern __shared__ unsigned char s[];
-		//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
 		uint32_t W1[16];
 		uint32_t W2[16];
 
@@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
 
 __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Gr��e des dynamischen Shared Memory Bereichs (abh�ngig von der Threadanzahl)
-	//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingef�gt gegen Bank Konflikte
+	// Gr��e des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
diff --git a/heavy.cu b/heavy.cu
index 6ce1a663d2..453c88761b 100644
--- a/heavy.cu
+++ b/heavy.cu
@@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
     return GPU_N;
 }
 
+static bool substringsearch(const char *haystack, const char *needle, int &match)
+{
+    int hlen = strlen(haystack);
+    int nlen = strlen(needle);
+    for (int i=0; i < hlen; ++i)
+    {
+        if (haystack[i] == ' ') continue;
+        int j=0, x = 0;
+        while(j < nlen)
+        {
+            if (haystack[i+x] == ' ') {++x; continue;}
+            if (needle[j] == ' ') {++j; continue;}
+            if (needle[j] == '#') return ++match == needle[j+1]-'0';
+            if (tolower(haystack[i+x]) != tolower(needle[j])) break;
+            ++j; ++x;
+        }
+        if (j == nlen) return true;
+    }
+    return false;
+}
+
+// CUDA Ger�t nach Namen finden (gibt Ger�te-Index zur�ck oder -1)
+extern "C" int cuda_finddevice(char *name)
+{
+    int num = cuda_num_devices();
+    int match = 0;
+    for (int i=0; i < num; ++i)
+    {
+        cudaDeviceProp props;
+        if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
+            if (substringsearch(props.name, name, match)) return i;
+    }
+    return -1;
+}
+
 // Zeitsynchronisations-Routine von cudaminer mit CPU sleep
 typedef struct { double value[8]; } tsumarray;
 cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)