Skip to content

Commit

Permalink
cryptonight: extra cleanup, to be continued...
Browse files Browse the repository at this point in the history
  • Loading branch information
tpruvot committed Jun 22, 2018
1 parent 793c9f1 commit 1cc5b6d
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 204 deletions.
3 changes: 2 additions & 1 deletion crypto/cn_groestl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
output[j] = s[i];
}

#if 0
for (i = 0; i < GROESTL_COLS512; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < GROESTL_SIZE512; i++) {
ctx->buffer[i] = 0;
}
#endif
}

__device__
Expand Down
4 changes: 2 additions & 2 deletions crypto/cryptolight-core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *

if(thread < threads)
{
const int oft = thread * 52 + sub + 16; // not aligned 16!
const int oft = thread * 50 + sub + 16; // not aligned 16!
const int long_oft = (thread << LONG_SHL_IDX) + sub;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
Expand Down Expand Up @@ -222,7 +222,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
if(thread < threads)
{
const int long_oft = (thread << LONG_SHL_IDX) + sub;
const int oft = thread * 52 + sub + 16;
const int oft = thread * 50 + sub + 16;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];

Expand Down
2 changes: 1 addition & 1 deletion crypto/cryptolight.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_

cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
Expand Down
6 changes: 3 additions & 3 deletions crypto/cryptolight.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)

void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);

void cryptonight_extra_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak1_2);
void cryptonight_extra_final(int thr_id, int threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
118 changes: 1 addition & 117 deletions crypto/cryptonight-core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -126,121 +126,6 @@ void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, cons

// --------------------------------------------------------------------------------------------------------------

#if 0
#if UINTPTR_MAX == UINT64_MAX
#define LPTR "l"
#else
#define LPTR "r"
#endif

__device__ __forceinline__ uint64_t loadGlobal64(uint64_t * const addr) {
uint64_t x;
asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(x) : LPTR (addr));
return x;
}

__device__ __forceinline__ uint32_t loadGlobal32(uint32_t * const addr) {
uint32_t x;
asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : LPTR (addr));
return x;
}

__device__ __forceinline__ void storeGlobal32(uint32_t* addr, uint32_t const & val) {
asm volatile("st.global.cg.u32 [%0], %1;" : : LPTR (addr), "r"(val));
}

__device__ __forceinline__ uint32_t variant1_1(const uint32_t src)
{
const uint32_t tmp = (src >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
return (src & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
}

__global__
void monero_phase2_messed(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread >= threads)
return;

const uint32_t batchsize = ITER >> (2U + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;

const uint32_t subthr = threadIdx.x & 3;
const uint32_t thrctx = thread << 2;
uint32_t * ctx_a = &d_ctx_a[thrctx];
uint32_t * ctx_b = &d_ctx_b[thrctx];
uint32_t * long_state = &d_long_state[thread << LONG_SHL32];

uint32_t d[2], t1[2], t2[2];
uint32_t a = ctx_a[subthr];
d[1] = ctx_b[subthr];

uint32_t tweak[2];
AS_UINT2(&tweak) = AS_UINT2(&d_tweak[thread]);

for (uint32_t i = start; i < end; i++)
{
#pragma unroll 2
for (int x = 0; x < 2; x++)
{
uint32_t j = ((__shfl(a, 0, 4) & 0x1FFFF0) >> 2) + subthr;

const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
const uint32_t x_1 = __shfl(x_0, subthr + 1, 4);
const uint32_t x_2 = __shfl(x_0, subthr + 2, 4);
const uint32_t x_3 = __shfl(x_0, subthr + 3, 4);

// t_fn = aes shared mem read
d[x] = a ^ t_fn0(x_0 & 0xff) ^
t_fn1((x_1 >> 8) & 0xff) ^
t_fn2((x_2 >> 16) & 0xff) ^
t_fn3((x_3 >> 24));
t1[0] = __shfl(d[x], 0, 4);

uint32_t z = d[0] ^ d[1];
if (subthr == 2) z = variant1_1(z);
storeGlobal32(long_state + j, z);

// -----------------------------------------------------------
j = ((*t1 & 0x1FFFF0) >> 2) + subthr;

uint32_t yy[2];
AS_U64(yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1));

t1[1] = __shfl(d[x], 1, 4);

uint32_t sub2 = (threadIdx.x & 2);
t2[0] = __shfl(a, sub2, 4);
t2[1] = __shfl(a, sub2 + 1U, 4);

uint32_t zz[2];
zz[0] = __shfl(yy[0], 0, 4);
zz[1] = __shfl(yy[1], 0, 4);

AS_U64(t2) += sub2 ? (AS_U64(t1) * AS_U64(zz)) : __umul64hi(AS_U64(t1), AS_U64(zz));

uint32_t s1 = subthr & 1U;
z = AS_U64(t2) >> (s1 * 32U); // hi or lo dword
//z = __byte_perm(t2[0], t2[1], s1 ? 0x7654 : 0x3210);
storeGlobal32(long_state + j, sub2 ? tweak[s1] ^ z : z);
a = z ^ yy[s1];
}
}

if (bfactor) {
ctx_a[subthr] = a;
ctx_b[subthr] = d[1];
}
}
#endif

__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
Expand Down Expand Up @@ -345,7 +230,7 @@ void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict_
extern int device_bfactor[MAX_GPUS];

__host__
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
Expand All @@ -371,7 +256,6 @@ void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long
cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else
monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
//monero_phase2_messed <<<grid, b>>> (throughput, bfactor, i, (uint32_t*) d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
Expand Down
3 changes: 1 addition & 2 deletions crypto/cryptonight-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ extern "C" {
#define VARIANT1_1(p) \
if (variant > 0) { \
const uint8_t tmp = ((const uint8_t*)(p))[11]; \
static const uint32_t table = 0x75310; \
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
((uint8_t*)(p))[11] = tmp ^ ((0x75310 >> index) & 0x30); \
}

struct cryptonight_ctx {
Expand Down
Loading

0 comments on commit 1cc5b6d

Please sign in to comment.