Skip to content

Commit

Permalink
rewrite jha algo to be more compatible
Browse files Browse the repository at this point in the history
old "german" implementation based on the quark method is kept
in the source tree, but is currently broken. to be continued.

This is a quick fix which should be compatible with all cards.
  • Loading branch information
tpruvot committed May 8, 2017
1 parent 762c7f8 commit 5dfeee4
Show file tree
Hide file tree
Showing 11 changed files with 299 additions and 25 deletions.
265 changes: 265 additions & 0 deletions JHA/jha.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
/**
* JHA v8 algorithm - compatible implementation
* @author tpruvot@github 05-2017
*/

extern "C" {
#include "sph/sph_keccak.h"
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_jh.h"
#include "sph/sph_skein.h"
}

#include "miner.h"
#include "cuda_helper.h"
#include "quark/cuda_quark.h"

static uint32_t *d_hash[MAX_GPUS] = { 0 };
static uint32_t *d_hash_br2[MAX_GPUS];
static uint32_t *d_tempBranch[MAX_GPUS];

extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);

// CPU HASH
extern "C" void jha_hash(void *output, const void *input)
{
uint32_t hash[16];

sph_blake512_context ctx_blake;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;

sph_keccak512_init(&ctx_keccak);
sph_keccak512 (&ctx_keccak, input, 80);
sph_keccak512_close(&ctx_keccak, hash);

for (int rnd = 0; rnd < 3; rnd++)
{
if (hash[0] & 0x01) {
sph_groestl512_init(&ctx_groestl);
sph_groestl512 (&ctx_groestl, (&hash), 64);
sph_groestl512_close(&ctx_groestl, (&hash));
}
else {
sph_skein512_init(&ctx_skein);
sph_skein512 (&ctx_skein, (&hash), 64);
sph_skein512_close(&ctx_skein, (&hash));
}

if (hash[0] & 0x01) {
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, (&hash), 64);
sph_blake512_close(&ctx_blake, (&hash));
}
else {
sph_jh512_init(&ctx_jh);
sph_jh512 (&ctx_jh, (&hash), 64);
sph_jh512_close(&ctx_jh, (&hash));
}
}
memcpy(output, hash, 32);
}

__global__ __launch_bounds__(128, 8)
void jha_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
uint4 *psrc = (uint4*) (&d_hash[offset]);
d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x01;
if (d_NonceBranch[thread]) return;
// uint4 = 4x uint32_t = 16 bytes
uint4 *pdst = (uint4*) (&d_branch2[offset]);
pdst[0] = psrc[0];
pdst[1] = psrc[1];
pdst[2] = psrc[2];
pdst[3] = psrc[3];
}
}

__global__ __launch_bounds__(128, 8)
void jha_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads && !d_NonceBranch[thread])
{
const uint32_t offset = thread * 16U;
uint4 *pdst = (uint4*) (&d_hash[offset]);
uint4 *psrc = (uint4*) (&d_branch2[offset]);
pdst[0] = psrc[0];
pdst[1] = psrc[1];
pdst[2] = psrc[2];
pdst[3] = psrc[3];
}
}

__host__
uint32_t jha_filter_cpu(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// extract algo permution hashes to a second branch buffer
jha_filter_gpu <<<grid, block>>> (threads, inpHashes, d_branch2, d_tempBranch[thr_id]);
return threads;
}

__host__
void jha_merge_cpu(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// put back second branch hashes to the common buffer d_hash
jha_merge_gpu <<<grid, block>>> (threads, outpHashes, d_branch2, d_tempBranch[thr_id]);
}

static bool init[MAX_GPUS] = { 0 };

extern "C" int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) endiandata[22];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];

uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

if (opt_benchmark)
ptarget[7] = 0x000f;

if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
cuda_get_arch(thr_id);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch[thr_id], sizeof(uint32_t) * throughput));

jackpot_keccak512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);

cuda_check_cpu_init(thr_id, throughput);

init[thr_id] = true;
}

for (int k=0; k < 22; k++)
be32enc(&endiandata[k], pdata[k]);

jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
cuda_check_cpu_setTarget(ptarget);

do {
int order = 0;

jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

for (int rnd = 0; rnd < 3; rnd++)
{
jha_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
jha_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);

jha_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
jha_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
}

*hashes_done = pdata[19] - first_nonce + throughput;

CUDA_LOG_ERROR();

work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];

be32enc(&endiandata[19], work->nonces[0]);
jha_hash(vhash, endiandata);

if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
jha_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}

if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}

pdata[19] += throughput;

} while (!work_restart[thr_id].restart);

*hashes_done = pdata[19] - first_nonce;

CUDA_LOG_ERROR();

return 0;
}

// cleanup
extern "C" void free_jha(int thr_id)
{
if (!init[thr_id])
return;

cudaThreadSynchronize();

cudaFree(d_hash[thr_id]);
cudaFree(d_hash_br2[thr_id]);
cudaFree(d_tempBranch[thr_id]);

quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);

cuda_check_cpu_free(thr_id);
CUDA_LOG_ERROR();

cudaDeviceSynchronize();
init[thr_id] = false;
}
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/jha.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
Expand Down
11 changes: 7 additions & 4 deletions README.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

ccminer 2.0 (March 2017) "Cryptonight & other funny algos"
ccminer 2.0 (May 2017)
---------------------------------------------------------------

***************************************************************
Expand Down Expand Up @@ -33,7 +33,7 @@ HeavyCoin & MjollnirCoin
FugueCoin
GroestlCoin & Myriad-Groestl
Lbry Credits
JackpotCoin
JackpotCoin (JHA)
QuarkCoin family & AnimeCoin
TalkCoin
DarkCoin and other X11 coins
Expand Down Expand Up @@ -77,6 +77,7 @@ its command line interface and options.

-a, --algo=ALGO specify the algorithm to use
bastion use to mine Joincoin
bitcore use to mine Bitcore's Timetravel10
blake use to mine Saffroncoin (Blake256)
blakecoin use to mine Old Blake 256
blake2s use to mine Nevacoin (Blake2-S 256)
Expand All @@ -91,7 +92,7 @@ its command line interface and options.
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
heavy use to mine Heavycoin
jackpot use to mine Jackpotcoin
jha use to mine JackpotCoin
keccak use to mine Maxcoin
lbry use to mine LBRY Credits
luffa use to mine Joincoin
Expand Down Expand Up @@ -171,6 +172,7 @@ its command line interface and options.
--max-log-rate Interval to reduce per gpu hashrate logs (default: 3)
--pstate=0 will force the Geforce 9xx to run in P0 P-State
--plimit=150W set the gpu power limit, allow multiple values for N cards
on windows this parameter use percentages (like OC tools)
--tlimit=85 Set the gpu thermal limit (windows only)
--keep-clocks prevent reset clocks and/or power limit on exit
--hide-diff Hide submitted shares diff and net difficulty
Expand Down Expand Up @@ -275,7 +277,7 @@ features.

>>> RELEASE HISTORY <<<

Mar. 08th 2017 v2.0
May. 08th 2017 v2.0
Handle cryptonight, wildkeccak and cryptonight-lite
Add a serie of new algos: timetravel, bastion, hmq1725, sha256t
Import lyra2z from djm34 work...
Expand All @@ -284,6 +286,7 @@ features.
Store the share diff of second nonce(s) in most algos
Hardware monitoring thread to get more accurate power readings
Small changes for the quiet mode & max-log-rate to reduce logs
Add bitcore and a compatible jha algo (quark and jackpot to fix)

Dec. 21th 2016 v1.8.4
Improve streebog based algos, veltor and sib (from alexis work)
Expand Down
6 changes: 4 additions & 2 deletions algos.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ enum sha_algos {
ALGO_HEAVY, /* Heavycoin hash */
ALGO_HMQ1725,
ALGO_KECCAK,
ALGO_JACKPOT,
ALGO_JHA,
ALGO_LBRY,
ALGO_LUFFA,
ALGO_LYRA2,
Expand Down Expand Up @@ -83,7 +83,7 @@ static const char *algo_names[] = {
"heavy",
"hmq1725",
"keccak",
"jackpot",
"jha",
"lbry",
"luffa",
"lyra2",
Expand Down Expand Up @@ -151,6 +151,8 @@ static inline int algo_to_int(char* arg)
i = ALGO_LUFFA;
else if (!strcasecmp("hmq17", arg))
i = ALGO_HMQ1725;
else if (!strcasecmp("jackpot", arg))
i = ALGO_JHA;
else if (!strcasecmp("lyra2re", arg))
i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg))
Expand Down
8 changes: 5 additions & 3 deletions bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void algo_free_all(int thr_id)
{
// only initialized algos will be freed
free_bastion(thr_id);
free_bitcore(thr_id);
free_blake256(thr_id);
free_blake2s(thr_id);
free_bmw(thr_id);
Expand All @@ -60,7 +61,8 @@ void algo_free_all(int thr_id)
free_groestlcoin(thr_id);
free_heavy(thr_id);
free_hmq17(thr_id);
free_jackpot(thr_id);
//free_jackpot(thr_id);
free_jha(thr_id);
free_lbry(thr_id);
free_luffa(thr_id);
free_lyra2(thr_id);
Expand Down Expand Up @@ -120,15 +122,15 @@ bool bench_algo_switch_next(int thr_id)
if (algo == ALGO_CRYPTOLIGHT) algo++;
if (algo == ALGO_CRYPTONIGHT) algo++;
if (algo == ALGO_WILDKECCAK) algo++;
if (algo == ALGO_JACKPOT) algo++; // to fix
//if (algo == ALGO_JACKPOT) algo++; // to fix
if (algo == ALGO_QUARK) algo++; // to fix
if (algo == ALGO_LBRY && CUDART_VERSION < 7000) algo++;

if (device_sm[dev_id] && device_sm[dev_id] < 300) {
// incompatible SM 2.1 kernels...
if (algo == ALGO_GROESTL) algo++;
if (algo == ALGO_MYR_GR) algo++;
if (algo == ALGO_JACKPOT) algo++; // compact shuffle
//if (algo == ALGO_JACKPOT) algo++; // compact shuffle
if (algo == ALGO_NEOSCRYPT) algo++;
if (algo == ALGO_WHIRLPOOLX) algo++;
}
Expand Down
Loading

0 comments on commit 5dfeee4

Please sign in to comment.