diff --git a/Makefile.am b/Makefile.am
index f0e63de9b..bf68583ed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -68,6 +68,11 @@ sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
 sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
 sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
 sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
+sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h
+sgminer_SOURCES += algorithm/credits.c algorithm/credits.h
+sgminer_SOURCES += algorithm/Lyra2RE_old.c algorithm/Lyra2RE_old.h algorithm/Lyra2_old.c algorithm/Lyra2_old.h algorithm/Sponge_old.c algorithm/Sponge_old.h
+sgminer_SOURCES += algorithm/Lyra2RE.c algorithm/Lyra2RE.h algorithm/Lyra2.c algorithm/Lyra2.h algorithm/Sponge.c algorithm/Sponge.h
+sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h
 
 bin_SCRIPTS = $(top_srcdir)/kernel/*.cl
 
diff --git a/algorithm.c b/algorithm.c
index 2f5494969..8b7630301 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -31,14 +31,20 @@
 #include "algorithm/fresh.h"
 #include "algorithm/whirlcoin.h"
 #include "algorithm/neoscrypt.h"
+#include "algorithm/Lyra2RE.h"     // Lyra2RE, new (v2) version
+#include "algorithm/Lyra2RE_old.h" // Lyra2RE, old version
+#include "algorithm/pluck.h"
+#include "algorithm/yescrypt.h"
+#include "algorithm/credits.h"
 
 #include "compat.h"
 
 #include <inttypes.h>
 #include <string.h>
-
+bool opt_lyra;
 const char *algorithm_type_str[] = {
   "Unknown",
+  "credits",
   "Scrypt",
   "NScrypt",
   "X11",
@@ -52,7 +58,12 @@ const char *algorithm_type_str[] = {
   "NIST",
   "Fresh",
   "Whirlcoin",
-  "Neoscrypt"
+  "Neoscrypt",
+  "Lyra2RE",
+  "Lyra2REv2",
+  "pluck",
+  "yescrypt",
+  "yescrypt-multi"
 };
 
 void sha256(const unsigned char *message, unsigned int len, unsigned char *digest)
@@ -178,6 +189,150 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
   return status;
 }
 
+static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_uint le_target;
+  cl_int status = 0;
+
+  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->target)[7]);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+static cl_int queue_credits_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = (cl_ulong)le64toh(((uint64_t *)blk->work->target)[3]);
+
+  // CREDITS uses a 168-byte block header, copied without byte-flipping
+  memcpy(clState->cldata, blk->work->data, 168);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 168, clState->cldata, 0, NULL, NULL);
+
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  CL_SET_ARG(blk->work->midstate);
+
+  return status;
+}
+
+static cl_int queue_yescrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_uint le_target;
+  cl_int status = 0;
+
+  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->target)[7]);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->buffer1);
+  CL_SET_ARG(clState->buffer2);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num = 0;
+  cl_uint le_target;
+  cl_int status = 0;
+
+  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->target)[7]);
+  memcpy(clState->cldata, blk->work->data, 80);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  // pbkdf and initial sha
+  kernel = &clState->kernel;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->buffer1);
+  CL_SET_ARG(clState->buffer2);
+  CL_SET_ARG(clState->buffer3);
+  CL_SET_ARG(le_target);
+
+  // inactive kernel
+  num = 0;
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_N(0, clState->buffer1);
+  CL_SET_ARG_N(1, clState->buffer2);
+
+  // mix2_2
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8);
+  CL_SET_ARG_N(1, clState->buffer1);
+  CL_SET_ARG_N(2, clState->buffer2);
+
+  // inactive kernel
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
+  CL_SET_ARG_N(1, clState->buffer2);
+
+  // mix2_2
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8);
+  CL_SET_ARG_N(1, clState->buffer1);
+  CL_SET_ARG_N(2, clState->buffer2);
+
+  // inactive kernel
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
+  CL_SET_ARG_N(1, clState->buffer2);
+
+  // pbkdf and finalization
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(clState->buffer2);
+  CL_SET_ARG(clState->buffer3);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
 static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
   cl_kernel *kernel = &clState->kernel;
@@ -211,6 +366,108 @@ static cl_int queue_sph_kernel(struct __clState *clState, struct _dev_blk_ctx *b
   return status;
 }
 
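+/*
+ * Orientation note for the queue functions in this file: the CL_SET_ARG
+ * family wraps clSetKernelArg() around a shared argument counter. A
+ * minimal sketch of the idea (the real macros are defined earlier in
+ * this file; the expansion shown here is illustrative, not
+ * authoritative):
+ *
+ *   #define CL_SET_ARG_N(n, var) \
+ *     status |= clSetKernelArg(*kernel, n, sizeof(var), (void *)&(var));
+ *   #define CL_SET_ARG(var) CL_SET_ARG_N(num++, var)
+ *   #define CL_NEXTKERNEL_SET_ARG_N(n, var) \
+ *     do { kernel++; CL_SET_ARG_N(n, var); } while (0)
+ *
+ * Consequently the CL_SET_ARG() calls in each queue function must appear
+ * in exactly the kernel's parameter order, and num must be reset to 0
+ * each time kernel is advanced along clState->extra_kernels.
+ */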
+static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_int status = 0;
+  cl_ulong le_target;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(blk->work->blk.ctx_a);
+  CL_SET_ARG(blk->work->blk.ctx_b);
+  CL_SET_ARG(blk->work->blk.ctx_c);
+  CL_SET_ARG(blk->work->blk.ctx_d);
+  CL_SET_ARG(blk->work->blk.ctx_e);
+  CL_SET_ARG(blk->work->blk.ctx_f);
+  CL_SET_ARG(blk->work->blk.ctx_g);
+  CL_SET_ARG(blk->work->blk.ctx_h);
+  CL_SET_ARG(blk->work->blk.cty_a);
+  CL_SET_ARG(blk->work->blk.cty_b);
+  CL_SET_ARG(blk->work->blk.cty_c);
+
+  // keccak - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // lyra2 - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search4
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_int status = 0;
+  cl_ulong le_target;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->buffer1);
+  CL_SET_ARG(blk->work->blk.ctx_a);
+  CL_SET_ARG(blk->work->blk.ctx_b);
+  CL_SET_ARG(blk->work->blk.ctx_c);
+  CL_SET_ARG(blk->work->blk.ctx_d);
+  CL_SET_ARG(blk->work->blk.ctx_e);
+  CL_SET_ARG(blk->work->blk.ctx_f);
+  CL_SET_ARG(blk->work->blk.ctx_g);
+  CL_SET_ARG(blk->work->blk.ctx_h);
+  CL_SET_ARG(blk->work->blk.cty_a);
+  CL_SET_ARG(blk->work->blk.cty_b);
+  CL_SET_ARG(blk->work->blk.cty_c);
+
+  // keccak - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->buffer1);
+  // cubehash - search2
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
+  // lyra - search3
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
+  CL_SET_ARG_N(1, clState->padbuffer8);
+  // skein - search4
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
+  // cubehash - search5
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
+  // bmw - search6
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->buffer1);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
 static cl_int queue_darkcoin_mod_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
   cl_kernel *kernel;
@@ -669,6 +926,29 @@ static algorithm_settings_t algos[] = {
   A_NEOSCRYPT("neoscrypt"),
 #undef A_NEOSCRYPT
 
+#define A_PLUCK(a) \
+  { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options }
+  A_PLUCK("pluck"),
+#undef A_PLUCK
+
+#define A_CREDITS(a) \
+  { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL }
+  A_CREDITS("credits"),
+#undef A_CREDITS
+
+#define A_YESCRYPT(a) \
+  { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options }
+  A_YESCRYPT("yescrypt"),
+#undef A_YESCRYPT
+
+#define A_YESCRYPT_MULTI(a) \
+  { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options }
+  A_YESCRYPT_MULTI("yescrypt-multi"),
+#undef A_YESCRYPT_MULTI
+
 // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
 #define A_QUARK(a, b) \
   { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
@@ -705,6 +985,11 @@ static algorithm_settings_t algos[] = {
   { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL },
 
+  { "Lyra2RE", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL },
+
+  { "Lyra2REv2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options },
+
 // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
 #define A_FUGUE(a, b, c) \
   { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL }
@@ -786,6 +1071,15 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
   ALGO_ALIAS("nist5", "talkcoin-mod");
   ALGO_ALIAS("keccak", "maxcoin");
   ALGO_ALIAS("whirlpool", "whirlcoin");
+  ALGO_ALIAS("lyra2rev2", "Lyra2REv2");
+  ALGO_ALIAS("lyra2v2", "Lyra2REv2");
+  ALGO_ALIAS("lyra2re", "Lyra2RE");
+  ALGO_ALIAS("lyra2", "Lyra2RE");
 
 #undef ALGO_ALIAS
 #undef ALGO_ALIAS_NF
@@ -811,7 +1105,7 @@ void set_algorithm(algorithm_t* algo, const char* newname_alias)
   // use old nfactor if it was previously set and is different than the one set by alias
   if ((old_nfactor > 0) && (old_nfactor != nfactor))
     nfactor = old_nfactor;
-
+  if (algo->type == ALGO_LYRA2RE || algo->type == ALGO_LYRA2REv2) { opt_lyra = true; }
   set_algorithm_nfactor(algo, nfactor);
 
   //reapply kernelfile if was set
diff --git a/algorithm.h b/algorithm.h
index bbe63beb3..e24421de0 100644
--- a/algorithm.h
+++ b/algorithm.h
@@ -12,6 +12,7 @@
 typedef enum {
   ALGO_UNK,
+  ALGO_CRE,
   ALGO_SCRYPT,
   ALGO_NSCRYPT,
   ALGO_X11,
@@ -25,7 +26,12 @@ typedef enum {
   ALGO_NIST,
   ALGO_FRESH,
   ALGO_WHIRL,
-  ALGO_NEOSCRYPT
+  ALGO_NEOSCRYPT,
+  ALGO_LYRA2RE,
+  ALGO_LYRA2REv2,
+  ALGO_PLUCK,
+  ALGO_YESCRYPT,
+  ALGO_YESCRYPT_MULTI,
 } algorithm_type_t;
 
 extern const char *algorithm_type_str[];
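Review aid for the dense positional initializers added to algos[] above: the fields line up with algorithm_settings_t roughly as sketched below. The field names are given only as an orientation and should be checked against the struct definition in algorithm.h, not taken as authoritative.

    { name, type, kernelfile,
      diff_multiplier1, diff_multiplier2, share_diff_multiplier,
      xintensity_shift, intensity_shift, found_idx,
      diff_numerator, diff1targ,
      n_extra_kernels, rw_buffer_size, cq_properties,
      regenhash, queue_kernel, gen_hash, set_compile_options }

Read this way, Lyra2REv2 runs one primary kernel plus six extra kernels on an out-of-order queue, while the old Lyra2RE runs four extra kernels and pre-allocates a 2 * 8 * 4194304-byte read/write buffer.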
diff --git a/algorithm/Lyra2.c b/algorithm/Lyra2.c
new file mode 100644
index 000000000..aa7d207e4
--- /dev/null
+++ b/algorithm/Lyra2.c
@@ -0,0 +1,213 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "Lyra2.h"
+#include "Sponge.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) {
+
+    //============================= Basic variables ============================//
+    int64_t row = 2;    //index of row to be processed
+    int64_t prev = 1;   //index of prev (last row ever computed/modified)
+    int64_t rowa = 0;   //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau;        //Time Loop iterator
+    int64_t step = 1;   //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1;    //Modifier to the step, assuming the values 1 or -1
+    int64_t i;          //auxiliary iteration counter
+    //==========================================================================/
+
+    //========== Initializing the Memory Matrix and pointers to it =============//
+    //Tries to allocate enough space for the whole memory matrix
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+    i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
+    uint64_t *wholeMatrix = malloc(i);
+    if (wholeMatrix == NULL) {
+        return -1;
+    }
+    memset(wholeMatrix, 0, i);
+
+    //Allocates pointers to each row of the matrix
+    uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
+    if (memMatrix == NULL) {
+        return -1;
+    }
+    //Places the pointers in the correct positions
+    uint64_t *ptrWord = wholeMatrix;
+    for (i = 0; i < nRows; i++) {
+        memMatrix[i] = ptrWord;
+        ptrWord += ROW_LEN_INT64;
+    }
+    //==========================================================================/
+
+    //============= Getting the password + salt + basil padded with 10*1 ===============//
+    //OBS.: The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the password
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+    //==========================================================================/
+
+    //======================= Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+    uint64_t *state = malloc(16 * sizeof (uint64_t));
+    if (state == NULL) {
+        return -1;
+    }
+    initState(state);
+    //==========================================================================/
+
+    //================================ Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    ptrWord = wholeMatrix;
+    for (i = 0; i < nBlocksInput; i++) {
+        absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+        ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+    }
+
+    //Initializes M[0] and M[1]
+    reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+    reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+
+    do {
+        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+        reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+        //updates the value of row* (deterministically picked during Setup)
+        rowa = (rowa + step) & (window - 1);
+        //update prev: it now points to the last row ever computed
+        prev = row;
+        //updates row: goes to the next row to be computed
+        row++;
+
+        //Checks if all rows in the window were visited.
+        if (rowa == 0) {
+            step = window + gap; //changes the step: approximately doubles its value
+            window *= 2; //doubles the size of the re-visitation window
+            gap = -gap; //inverts the modifier to the step
+        }
+    } while (row < nRows);
+    //==========================================================================/
+
+    //============================ Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for (tau = 1; tau <= timeCost; tau++) {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+            //Selects a pseudorandom index row*
+            //------------------------------------------------------------------------------------------
+            //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+            rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+            //------------------------------------------------------------------------------------------
+
+            //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+            reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+            //update prev: it now points to the last row ever computed
+            prev = row;
+
+            //updates row: goes to the next row to be computed
+            //------------------------------------------------------------------------------------------
+            //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+            row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+            //------------------------------------------------------------------------------------------
+        } while (row != 0);
+    }
+    //==========================================================================/
+
+    //============================ Wrap-up Phase ===============================//
+    //Absorbs the last block of the memory matrix
+    absorbBlock(state, memMatrix[rowa]);
+
+    //Squeezes the key
+    squeeze(state, K, kLen);
+    //==========================================================================/
+
+    //========================= Freeing the memory =============================//
+    free(memMatrix);
+    free(wholeMatrix);
+
+    //Wiping out the sponge's internal state before freeing it
+    memset(state, 0, 16 * sizeof (uint64_t));
+    free(state);
+    //==========================================================================/
+
+    return 0;
+}
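For orientation: within this patch LYRA2() is never used as a general KDF; the chained hash functions call it with small, fixed parameters. A minimal sketch of the v2 call site (it appears verbatim in lyra2rehash() in Lyra2RE.c further down; hashA and hashB are 8-word uint32_t buffers):

    uint32_t hashA[8], hashB[8];
    /* 32-byte key; password and salt are both the previous 32-byte digest;
       timeCost = 1; 4 rows x 4 cols of 96-byte blocks = 1536-byte matrix */
    LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);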
diff --git a/algorithm/Lyra2.h b/algorithm/Lyra2.h
new file mode 100644
index 000000000..c79089457
--- /dev/null
+++ b/algorithm/Lyra2.h
@@ -0,0 +1,42 @@
+/**
+ * Header file for the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LYRA2_H_
+#define LYRA2_H_
+
+#include <stdint.h>
+
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
+
+#ifdef BLOCK_LEN_BITS
+  #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length, in uint64_t words
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8)  //Block length, in bytes
+#else //default block length: 768 bits
+  #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
+#endif
+
+int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
+
+#endif /* LYRA2_H_ */
diff --git a/algorithm/Lyra2RE.c b/algorithm/Lyra2RE.c
new file mode 100644
index 000000000..f6d36df83
--- /dev/null
+++ b/algorithm/Lyra2RE.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright 2014 James Lovejoy
+ * Copyright 2014 phm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_cubehash.h"
+#include "Lyra2.h"
+
+/*
+ * Convert a vector of len (uint32_t) words into big-endian form.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+inline void lyra2rehash(void *state, const void *input)
+{
+  sph_blake256_context ctx_blake;
+  sph_bmw256_context ctx_bmw;
+  sph_keccak256_context ctx_keccak;
+  sph_skein256_context ctx_skein;
+  sph_cubehash256_context ctx_cube;
+  uint32_t hashA[8], hashB[8];
+
+  sph_blake256_init(&ctx_blake);
+  sph_blake256(&ctx_blake, input, 80);
+  sph_blake256_close(&ctx_blake, hashA);
+
+  sph_keccak256_init(&ctx_keccak);
+  sph_keccak256(&ctx_keccak, hashA, 32);
+  sph_keccak256_close(&ctx_keccak, hashB);
+
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashB, 32);
+  sph_cubehash256_close(&ctx_cube, hashA);
+
+  LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
+
+  sph_skein256_init(&ctx_skein);
+  sph_skein256(&ctx_skein, hashB, 32);
+  sph_skein256_close(&ctx_skein, hashA);
+
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashA, 32);
+  sph_cubehash256_close(&ctx_cube, hashB);
+
+  sph_bmw256_init(&ctx_bmw);
+  sph_bmw256(&ctx_bmw, hashB, 32);
+  sph_bmw256_close(&ctx_bmw, hashA);
+
+  memcpy(state, hashA, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/* Used externally as confirmation of correct OCL code */
+int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[20], ohash[8];
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+  data[19] = htobe32(nonce);
+  lyra2rehash(ohash, data);
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+
+  if (tmp_hash7 > diff1targ)
+    return -1;
+  if (tmp_hash7 > Htarg)
+    return 0;
+  return 1;
+}
+
+void lyra2re_regenhash(struct work *work)
+{
+  uint32_t data[20];
+  uint32_t *nonce = (uint32_t *)(work->data + 76);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 19);
+  data[19] = htobe32(*nonce);
+  lyra2rehash(ohash, data);
+}
+
+bool scanhash_lyra2re(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+  unsigned char *pdata, unsigned char __maybe_unused *phash1,
+  unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+  uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce = (uint32_t *)(pdata + 76);
+  uint32_t data[20];
+  uint32_t tmp_hash7;
+  uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  bool ret = false;
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+  while (1) {
+    uint32_t ostate[8];
+
+    *nonce = ++n;
+    data[19] = n;
+    lyra2rehash(ostate, data);
+    tmp_hash7 = ostate[7];
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
+
+    if (unlikely(tmp_hash7 <= Htarg)) {
+      ((uint32_t *)pdata)[19] = htobe32(n);
+      *last_nonce = n;
+      ret = true;
+      break;
+    }
+
+    if (unlikely((n >= max_nonce) || thr->work_restart)) {
+      *last_nonce = n;
+      break;
+    }
+  }
+
+  return ret;
+}
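Reviewer's note on the *_test helpers added here and in Lyra2RE_old.c: they follow the return convention of sgminer's other CPU confirmation functions, which is what the driver uses to tell hardware errors from merely-stale shares:

    /* int ret = lyra2re_test(work->data, work->target, nonce);
     *   ret == 1  : hash <= share target        (valid share)
     *   ret == 0  : hash <= diff-1 target only  (above share target)
     *   ret == -1 : hash fails even diff-1      (likely HW/kernel error)
     */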
diff --git a/algorithm/Lyra2RE.h b/algorithm/Lyra2RE.h
new file mode 100644
index 000000000..ca4a04fb8
--- /dev/null
+++ b/algorithm/Lyra2RE.h
@@ -0,0 +1,11 @@
+#ifndef LYRA2RE_H
+#define LYRA2RE_H
+
+#include "miner.h"
+
+#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size: [12][4][4] uint64_t or equivalent
+#define LYRA_SECBUF_SIZE (4)        // (not used)
+
+extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget,
+  uint32_t nonce);
+extern void lyra2re_regenhash(struct work *work);
+
+#endif /* LYRA2RE_H */
diff --git a/algorithm/Lyra2RE_old.c b/algorithm/Lyra2RE_old.c
new file mode 100644
index 000000000..b3f3b4fd9
--- /dev/null
+++ b/algorithm/Lyra2RE_old.c
@@ -0,0 +1,169 @@
+/*-
+ * Copyright 2014 James Lovejoy
+ * Copyright 2014 phm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "Lyra2_old.h"
+
+/*
+ * Convert a vector of len (uint32_t) words into big-endian form.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+inline void lyra2rehash_old(void *state, const void *input)
+{
+  sph_blake256_context ctx_blake;
+  sph_groestl256_context ctx_groestl;
+  sph_keccak256_context ctx_keccak;
+  sph_skein256_context ctx_skein;
+  uint32_t hashA[8], hashB[8];
+
+  sph_blake256_init(&ctx_blake);
+  sph_blake256(&ctx_blake, input, 80);
+  sph_blake256_close(&ctx_blake, hashA);
+
+  sph_keccak256_init(&ctx_keccak);
+  sph_keccak256(&ctx_keccak, hashA, 32);
+  sph_keccak256_close(&ctx_keccak, hashB);
+
+  LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+
+  sph_skein256_init(&ctx_skein);
+  sph_skein256(&ctx_skein, hashA, 32);
+  sph_skein256_close(&ctx_skein, hashB);
+
+  sph_groestl256_init(&ctx_groestl);
+  sph_groestl256(&ctx_groestl, hashB, 32);
+  sph_groestl256_close(&ctx_groestl, hashA);
+
+  memcpy(state, hashA, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/* Used externally as confirmation of correct OCL code */
+int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[20], ohash[8];
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+  data[19] = htobe32(nonce);
+  lyra2rehash_old(ohash, data);
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+
+  if (tmp_hash7 > diff1targ)
+    return -1;
+  if (tmp_hash7 > Htarg)
+    return 0;
+  return 1;
+}
+
+void lyra2reold_regenhash(struct work *work)
+{
+  uint32_t data[20];
+  uint32_t *nonce = (uint32_t *)(work->data + 76);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 19);
+  data[19] = htobe32(*nonce);
+  lyra2rehash_old(ohash, data);
+}
+
+bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+  unsigned char *pdata, unsigned char __maybe_unused *phash1,
+  unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+  uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce = (uint32_t *)(pdata + 76);
+  uint32_t data[20];
+  uint32_t tmp_hash7;
+  uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  bool ret = false;
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+  while (1) {
+    uint32_t ostate[8];
+
+    *nonce = ++n;
+    data[19] = n;
+    lyra2rehash_old(ostate, data);
+    tmp_hash7 = ostate[7];
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
+
+    if (unlikely(tmp_hash7 <= Htarg)) {
+      ((uint32_t *)pdata)[19] = htobe32(n);
+      *last_nonce = n;
+      ret = true;
+      break;
+    }
+
+    if (unlikely((n >= max_nonce) || thr->work_restart)) {
+      *last_nonce = n;
+      break;
+    }
+  }
+
+  return ret;
+}
diff --git a/algorithm/Lyra2RE_old.h b/algorithm/Lyra2RE_old.h
new file mode 100644
index 000000000..0788dfb35
--- /dev/null
+++ b/algorithm/Lyra2RE_old.h
@@ -0,0 +1,10 @@
+#ifndef LYRA2REOLD_H
+#define LYRA2REOLD_H
+
+#include "miner.h"
+
+extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget,
+  uint32_t nonce);
+extern void lyra2reold_regenhash(struct work *work);
+
+#endif /* LYRA2REOLD_H */
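One behavioural difference between the two generations that is easy to miss in review: besides taking nRows/nCols at run time instead of using a compile-time N_COLS, the new LYRA2() advances the absorb pointer by words while the old LYRA2O() advances it by bytes. The old stride looks like a bug, but it is what defines the consensus hashes of pre-fork Lyra2RE, so it presumably has to be preserved as-is:

    /* Lyra2.c (v2):       ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64;  // +8  uint64_t
     * Lyra2_old.c (old):  ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES;  // +64 uint64_t */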
diff --git a/algorithm/Lyra2_old.c b/algorithm/Lyra2_old.c
new file mode 100644
index 000000000..f78c49032
--- /dev/null
+++ b/algorithm/Lyra2_old.c
@@ -0,0 +1,208 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "Lyra2_old.h"
+#include "Sponge_old.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) {
+
+    //============================= Basic variables ============================//
+    int64_t row = 2;    //index of row to be processed
+    int64_t prev = 1;   //index of prev (last row ever computed/modified)
+    int64_t rowa = 0;   //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau;        //Time Loop iterator
+    int64_t step = 1;   //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1;    //Modifier to the step, assuming the values 1 or -1
+    int64_t i;          //auxiliary iteration counter
+    //==========================================================================/
+
+    //========== Initializing the Memory Matrix and pointers to it =============//
+    //Tries to allocate enough space for the whole memory matrix
+    i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
+    uint64_t *wholeMatrix = malloc(i);
+    if (wholeMatrix == NULL) {
+        return -1;
+    }
+    memset(wholeMatrix, 0, i);
+
+    //Allocates pointers to each row of the matrix
+    uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
+    if (memMatrix == NULL) {
+        return -1;
+    }
+    //Places the pointers in the correct positions
+    uint64_t *ptrWord = wholeMatrix;
+    for (i = 0; i < nRows; i++) {
+        memMatrix[i] = ptrWord;
+        ptrWord += ROW_LEN_INT64;
+    }
+    //==========================================================================/
+
+    //============= Getting the password + salt + basil padded with 10*1 ===============//
+    //OBS.: The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the password
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+    //==========================================================================/
+
+    //======================= Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+    uint64_t *state = malloc(16 * sizeof (uint64_t));
+    if (state == NULL) {
+        return -1;
+    }
+    initState(state);
+    //==========================================================================/
+
+    //================================ Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    ptrWord = wholeMatrix;
+    for (i = 0; i < nBlocksInput; i++) {
+        absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+        ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
+    }
+
+    //Initializes M[0] and M[1]
+    reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here
+    reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]);
+
+    do {
+        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+        reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
+
+        //updates the value of row* (deterministically picked during Setup)
+        rowa = (rowa + step) & (window - 1);
+        //update prev: it now points to the last row ever computed
+        prev = row;
+        //updates row: goes to the next row to be computed
+        row++;
+
+        //Checks if all rows in the window were visited.
+        if (rowa == 0) {
+            step = window + gap; //changes the step: approximately doubles its value
+            window *= 2; //doubles the size of the re-visitation window
+            gap = -gap; //inverts the modifier to the step
+        }
+    } while (row < nRows);
+    //==========================================================================/
+
+    //============================ Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for (tau = 1; tau <= timeCost; tau++) {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+            //Selects a pseudorandom index row*
+            //------------------------------------------------------------------------------------------
+            //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+            rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+            //------------------------------------------------------------------------------------------
+
+            //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+            reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
+
+            //update prev: it now points to the last row ever computed
+            prev = row;
+
+            //updates row: goes to the next row to be computed
+            //------------------------------------------------------------------------------------------
+            //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+            row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+            //------------------------------------------------------------------------------------------
+        } while (row != 0);
+    }
+    //==========================================================================/
+
+    //============================ Wrap-up Phase ===============================//
+    //Absorbs the last block of the memory matrix
+    absorbBlockO(state, memMatrix[rowa]);
+
+    //Squeezes the key
+    squeezeO(state, K, kLen);
+    //==========================================================================/
+
+    //========================= Freeing the memory =============================//
+    free(memMatrix);
+    free(wholeMatrix);
+
+    //Wiping out the sponge's internal state before freeing it
+    memset(state, 0, 16 * sizeof (uint64_t));
+    free(state);
+    //==========================================================================/
+
+    return 0;
+}
diff --git a/algorithm/Lyra2_old.h b/algorithm/Lyra2_old.h
new file mode 100644
index 000000000..9dbe56682
--- /dev/null
+++ b/algorithm/Lyra2_old.h
@@ -0,0 +1,50 @@
+/**
+ * Header file for the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LYRA2OLD_H_
+#define LYRA2OLD_H_
+
+#include <stdint.h>
+
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
+
+#ifdef BLOCK_LEN_BITS
+  #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length, in uint64_t words
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8)  //Block length, in bytes
+#else //default block length: 768 bits
+  #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
+#endif
+
+#ifndef N_COLS
+  #define N_COLS 8 //Number of columns in the memory matrix: fixed to 8 for the old Lyra2RE
+#endif
+
+#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
+#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
+
+int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
+
+#endif /* LYRA2OLD_H_ */
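As a cross-check of the scratchpad constants used elsewhere in the patch, the per-hash matrix sizes implied by these headers work out as follows (one block = BLOCK_LEN_INT64 * 8 = 96 bytes):

    /* old Lyra2RE (LYRA2O):  8 rows * N_COLS(8) * 96 B = 6144 B per hash
     * Lyra2REv2   (LYRA2):   4 rows * 4 cols    * 96 B = 1536 B per hash
     *                        == LYRA_SCRATCHBUF_SIZE in Lyra2RE.h        */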
diff --git a/algorithm/Sponge.c b/algorithm/Sponge.c
new file mode 100644
index 000000000..8ece6f99d
--- /dev/null
+++ b/algorithm/Sponge.c
@@ -0,0 +1,745 @@
+/**
+ * A simple implementation of Blake2b's internal permutation
+ * in the form of a sponge.
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "Sponge.h"
+#include "Lyra2.h"
+
+/**
+ * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
+ * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
+ * typically have their internal state initialized with zeros, Blake2b's G function
+ * has a fixed point: if the internal state and message are both filled with zeros, the
+ * resulting permutation will always be a block filled with zeros; this happens because
+ * Blake2b does not use the constants originally employed in Blake2 inside its G function,
+ * relying on the IV for avoiding possible fixed points.
+ *
+ * @param state The 1024-bit array to be initialized
+ */
+inline void initState(uint64_t state[/*16*/]) {
+    //First 512 bits are zeros
+    memset(state, 0, 64);
+    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+    state[8] = blake2b_IV[0];
+    state[9] = blake2b_IV[1];
+    state[10] = blake2b_IV[2];
+    state[11] = blake2b_IV[3];
+    state[12] = blake2b_IV[4];
+    state[13] = blake2b_IV[5];
+    state[14] = blake2b_IV[6];
+    state[15] = blake2b_IV[7];
+}
+
+/**
+ * Executes Blake2b's G function, with all 12 rounds.
+ *
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+inline static void blake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+    ROUND_LYRA(1);
+    ROUND_LYRA(2);
+    ROUND_LYRA(3);
+    ROUND_LYRA(4);
+    ROUND_LYRA(5);
+    ROUND_LYRA(6);
+    ROUND_LYRA(7);
+    ROUND_LYRA(8);
+    ROUND_LYRA(9);
+    ROUND_LYRA(10);
+    ROUND_LYRA(11);
+}
+
+/**
+ * Executes a reduced version of Blake2b's G function with only one round
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+inline static void reducedBlake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+}
+
+/**
+ * Performs a squeeze operation, using Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param out Array that will receive the data squeezed
+ * @param len The number of bytes to be squeezed into the "out" array
+ */
+inline void squeeze(uint64_t *state, byte *out, unsigned int len) {
+    int fullBlocks = len / BLOCK_LEN_BYTES;
+    byte *ptr = out;
+    int i;
+
+    //Squeezes full blocks
+    for (i = 0; i < fullBlocks; i++) {
+        memcpy(ptr, state, BLOCK_LEN_BYTES);
+        blake2bLyra(state);
+        ptr += BLOCK_LEN_BYTES;
+    }
+
+    //Squeezes remaining bytes
+    memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
+ * of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
+ */
+inline void absorbBlock(uint64_t *state, const uint64_t *in) {
+    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+    state[0] ^= in[0];
+    state[1] ^= in[1];
+    state[2] ^= in[2];
+    state[3] ^= in[3];
+    state[4] ^= in[4];
+    state[5] ^= in[5];
+    state[6] ^= in[6];
+    state[7] ^= in[7];
+    state[8] ^= in[8];
+    state[9] ^= in[9];
+    state[10] ^= in[10];
+    state[11] ^= in[11];
+
+    //Applies the transformation f to the sponge's state
+    blake2bLyra(state);
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
+ * words of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
+ */
+inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
+    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
+    state[0] ^= in[0];
+    state[1] ^= in[1];
+    state[2] ^= in[2];
+    state[3] ^= in[3];
+    state[4] ^= in[4];
+    state[5] ^= in[5];
+    state[6] ^= in[6];
+    state[7] ^= in[7];
+
+    //Applies the transformation f to the sponge's state
+    blake2bLyra(state);
+}
+
+/**
+ * Performs a reduced squeeze operation for a single row, from the highest to
+ * the lowest index, using the reduced-round Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param rowOut Row to receive the data squeezed
+ */
+inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) {
+    uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
+    int i;
+
+    //M[row][C-1-col] = H.reduced_squeeze()
+    for (i = 0; i < nCols; i++) {
+        ptrWord[0] = state[0];
+        ptrWord[1] = state[1];
+        ptrWord[2] = state[2];
+        ptrWord[3] = state[3];
+        ptrWord[4] = state[4];
+        ptrWord[5] = state[5];
+        ptrWord[6] = state[6];
+        ptrWord[7] = state[7];
+        ptrWord[8] = state[8];
+        ptrWord[9] = state[9];
+        ptrWord[10] = state[10];
+        ptrWord[11] = state[11];
+
+        //Goes to next block (column) that will receive the squeezed data
+        ptrWord -= BLOCK_LEN_INT64;
+
+        //Applies the reduced-round transformation f to the sponge's state
+        reducedBlake2bLyra(state);
+    }
+}
+
+/**
+ * Performs a reduced duplex operation for a single row, from the highest to
+ * the lowest index, using the reduced-round Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param rowIn Row to feed the sponge
+ * @param rowOut Row to receive the sponge's output
+ */
+inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) {
+    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    int i;
+
+    for (i = 0; i < nCols; i++) {
+        //Absorbing "M[prev][col]"
+        state[0] ^= (ptrWordIn[0]);
+        state[1] ^= (ptrWordIn[1]);
+        state[2] ^= (ptrWordIn[2]);
+        state[3] ^= (ptrWordIn[3]);
+        state[4] ^= (ptrWordIn[4]);
+        state[5] ^= (ptrWordIn[5]);
+        state[6] ^= (ptrWordIn[6]);
+        state[7] ^= (ptrWordIn[7]);
+        state[8] ^= (ptrWordIn[8]);
+        state[9] ^= (ptrWordIn[9]);
+        state[10] ^= (ptrWordIn[10]);
+        state[11] ^= (ptrWordIn[11]);
+
+        //Applies the reduced-round transformation f to the sponge's state
+        reducedBlake2bLyra(state);
+
+        //M[row][C-1-col] = M[prev][col] XOR rand
+        ptrWordOut[0] = ptrWordIn[0] ^ state[0];
+        ptrWordOut[1] = ptrWordIn[1] ^ state[1];
+        ptrWordOut[2] = ptrWordIn[2] ^ state[2];
+        ptrWordOut[3] = ptrWordIn[3] ^ state[3];
+        ptrWordOut[4] = ptrWordIn[4] ^ state[4];
+        ptrWordOut[5] = ptrWordIn[5] ^ state[5];
+        ptrWordOut[6] = ptrWordIn[6] ^ state[6];
+        ptrWordOut[7] = ptrWordIn[7] ^ state[7];
+        ptrWordOut[8] = ptrWordIn[8] ^ state[8];
+        ptrWordOut[9] = ptrWordIn[9] ^ state[9];
+        ptrWordOut[10] = ptrWordIn[10] ^ state[10];
+        ptrWordOut[11] = ptrWordIn[11] ^ state[11];
+
+        //Input: next column (i.e., next block in sequence)
+        ptrWordIn += BLOCK_LEN_INT64;
+        //Output: goes to previous column
+        ptrWordOut -= BLOCK_LEN_INT64;
+    }
+}
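+/*
+ * Note on rotW in the duplexing routines below: rotating the 768-bit
+ * output "rand" one 64-bit word to the left is implemented purely by
+ * reading the state words off by one, with no rotate instruction:
+ *
+ *   ptrWordInOut[0]  ^= state[11];
+ *   ptrWordInOut[1]  ^= state[0];
+ *   ...
+ *   ptrWordInOut[11] ^= state[10];
+ */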
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < nCols; i++) { + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
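 + * + * A minimal sketch of the wordwise "[+]" operator (assumption: independent per-word addition modulo 2^64, which is what the plain uint64_t additions in the code below compute): + * + *   for (int k = 0; k < 12; k++) + *       sum[k] = a[k] + b[k];   // wraps mod 2^64; no carry propagates into word k+1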
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = rand + ptrWordOut[0] = state[0]; + ptrWordOut[1] = state[1]; + ptrWordOut[2] = state[2]; + ptrWordOut[3] = state[3]; + ptrWordOut[4] = state[4]; + ptrWordOut[5] = state[5]; + ptrWordOut[6] = state[6]; + ptrWordOut[7] = state[7]; + ptrWordOut[8] = state[8]; + ptrWordOut[9] = state[9]; + ptrWordOut[10] = state[10]; + ptrWordOut[11] = state[11]; + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; + int i; + + for (i = 0; i < N_COLS / 2; i++) { + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += 2 * BLOCK_LEN_INT64; + } + + ptrWordOut = rowOut + BLOCK_LEN_INT64; + for (i = 0; i < N_COLS / 2; i++) { + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; 
+ ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += 2 * BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" + * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", + * where rotW is a 64-bit rotation to the left. + * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} +*/ + +/** + Prints an array of unsigned chars + */ +void printArray(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/algorithm/Sponge.h b/algorithm/Sponge.h new file mode 100644 index 000000000..4ea1dc939 --- /dev/null +++ b/algorithm/Sponge.h @@ -0,0 +1,108 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. 
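 + * + * Typical call sequence (an illustrative sketch of the API declared below; "in" and "out" stand for caller-provided buffers): + * + *   uint64_t st[16]; + *   initState(st);                   // zeros + Blake2b IV + *   absorbBlockBlake2Safe(st, in);   // absorb one 8-word input block + *   squeeze(st, out, len);           // squeeze len bytes of output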
+ * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGE_H_ +#define SPONGE_H_ + +#include <stdint.h> + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + + +/*Blake2b IV Array*/ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ +static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/*Blake2b's G function*/ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +//---- Housekeeping +void initState(uint64_t state[/*16*/]); + +//---- Squeezes +void squeeze(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); + +//---- Absorbs +void absorbBlock(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); + +//---- Misc +void printArray(unsigned char *array, unsigned int size, char *name); + +//////////////////////////////////////////////////////////////////////////////////////////////// + + +////TESTS//// +//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2); +//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, 
uint64_t *rowInOut, uint64_t *rowOut); +//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +///////////// + + +#endif /* SPONGE_H_ */ diff --git a/algorithm/Sponge_old.c b/algorithm/Sponge_old.c new file mode 100644 index 000000000..aa6c3017a --- /dev/null +++ b/algorithm/Sponge_old.c @@ -0,0 +1,405 @@ +/** + * A simple implementation of Blake2b's internal permutation + * in the form of a sponge. + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include "Sponge_old.h" +#include "Lyra2_old.h" + + + +/** + * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder + * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges + * typically have their internal state initialized with zeros, Blake2b's G function + * has a fixed point: if the internal state and message are both filled with zeros, the + * resulting permutation will always be a block filled with zeros; this happens because + * Blake2b does not use the constants originally employed in Blake2 inside its G function, + * relying on the IV for avoiding possible fixed points. + * + * @param state The 1024-bit array to be initialized + */ +inline void initStateO(uint64_t state[/*16*/]) { + //First 512 bits are zeros + memset(state, 0, 64); + //The remaining BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved for the IV + state[8] = blake2b_IV[0]; + state[9] = blake2b_IV[1]; + state[10] = blake2b_IV[2]; + state[11] = blake2b_IV[3]; + state[12] = blake2b_IV[4]; + state[13] = blake2b_IV[5]; + state[14] = blake2b_IV[6]; + state[15] = blake2b_IV[7]; +} + +/** + * Executes Blake2b's G function, with all 12 rounds. 
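 + * + * One round applies G to the four columns and then to the four diagonals of the 4x4 state of uint64_t words; sketched below (illustrative, mirroring the ROUND_LYRA macro with its unused r/i arguments dropped): + * + *   G(v[0],v[4],v[ 8],v[12]);  G(v[1],v[5],v[ 9],v[13]);   // columns + *   G(v[2],v[6],v[10],v[14]);  G(v[3],v[7],v[11],v[15]); + *   G(v[0],v[5],v[10],v[15]);  G(v[1],v[6],v[11],v[12]);   // diagonals + *   G(v[2],v[7],v[ 8],v[13]);  G(v[3],v[4],v[ 9],v[14]);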
+ * + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +inline static void blake2bLyra(uint64_t *v) { + ROUND_LYRA(0); + ROUND_LYRA(1); + ROUND_LYRA(2); + ROUND_LYRA(3); + ROUND_LYRA(4); + ROUND_LYRA(5); + ROUND_LYRA(6); + ROUND_LYRA(7); + ROUND_LYRA(8); + ROUND_LYRA(9); + ROUND_LYRA(10); + ROUND_LYRA(11); +} + +/** + * Executes a reduced version of Blake2b's G function with only one round + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +inline static void reducedBlake2bLyra(uint64_t *v) { + ROUND_LYRA(0); +} + +/** + * Performs a squeeze operation, using Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param out Array that will receive the data squeezed + * @param len The number of bytes to be squeezed into the "out" array + */ +inline void squeezeO(uint64_t *state, byte *out, unsigned int len) { + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *ptr = out; + int i; + //Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words + * of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_INT64 words) + */ +inline void absorbBlockO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * words of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) + */ +inline void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +inline void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) { + uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < N_COLS; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] 
= state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ +inline void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left and N_COLS is a system parameter. 
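 + * + * For orientation (illustrative only): with C = N_COLS, the output pointer starts at the last block and walks backwards, so the sponge output for column col lands in M[row][(C-1)-col]: + * + *   uint64_t *dst = rowOut + (N_COLS - 1) * BLOCK_LEN_INT64;   // M[row][C-1] + *   // ... after each column: dst -= BLOCK_LEN_INT64;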
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
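 + * + * Note the contrast with the setup phase (schematic only): here the output row accumulates by XOR instead of being overwritten: + * + *   M[rowOut][col]   ^= rand;             // setup instead wrote M[prev][col] ^ rand + *   M[rowInOut][col] ^= rotW(rand);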
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + Prints an array of unsigned chars + */ +void printArrayO(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/algorithm/Sponge_old.h b/algorithm/Sponge_old.h new file mode 100644 index 000000000..f8b7de250 --- /dev/null +++ b/algorithm/Sponge_old.h @@ -0,0 +1,98 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGEOLD_H_ +#define SPONGEOLD_H_ + +#include <stdint.h> + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + + +/*Blake2b IV Array*/ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ +static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/*Blake2b's G function*/ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +//---- Housekeeping +void initStateO(uint64_t state[/*16*/]); + +//---- Squeezes +void squeezeO(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0O(uint64_t* state, uint64_t* row); + +//---- Absorbs +void absorbBlockO(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + +//---- Misc +void printArrayO(unsigned char *array, unsigned int size, char *name); + +//////////////////////////////////////////////////////////////////////////////////////////////// + + +#endif /* SPONGEOLD_H_ */ diff --git a/algorithm/credits.c b/algorithm/credits.c new file mode 100644 index 000000000..b69514bca --- /dev/null +++ b/algorithm/credits.c @@ -0,0 +1,148 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" +#include "miner.h" + +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + +#include "sph/sph_sha2.h" + +static const uint32_t diff1targ = 0x0000ffff; + + + +inline void credits_hash(void *state, const void *input) +{ + sph_sha256_context sha1, sha2; + uint32_t hash[8], hash2[8]; + + sph_sha256_init(&sha1); + sph_sha256(&sha1, input, 168); + sph_sha256_close(&sha1, hash); + + + sph_sha256_init(&sha2); + sph_sha256(&sha2, hash, 32); + sph_sha256_close(&sha2, hash2); + + memcpy(state, hash2, 32); + +} +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +/* Used externally as confirmation of correct OCL code */ +int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[42], ohash[8]; + printf("coming here credits test\n"); + + be32enc_vect(data, (const uint32_t *)pdata, 42); + data[35] = htobe32(nonce); + credits_hash((unsigned char*)ohash, (unsigned char*)data); + + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + + if (tmp_hash7 > diff1targ) + return -1; + + if (tmp_hash7 > Htarg) + return 0; + + return 1; +} + +void credits_regenhash(struct work *work) +{ + uint32_t data[42]; + uint32_t *nonce = (uint32_t *)(work->data + 140); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 42); + data[35] = htobe32(*nonce); + + credits_hash((unsigned char*)ohash, (unsigned char*)data); + +} + + +bool scanhash_credits(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 140); + uint32_t data[42]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 35); + + + while (1) + { + uint32_t ostate[8]; + + *nonce = ++n; + data[35] = (n); + credits_hash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", (long unsigned int)ostate[7]); + + if (unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[35] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} \ No newline at end of file diff --git a/algorithm/credits.h b/algorithm/credits.h new file 
mode 100644 index 000000000..9d74ad20d --- /dev/null +++ b/algorithm/credits.h @@ -0,0 +1,10 @@ +#ifndef CREDITS_H +#define CREDITS_H + +#include "miner.h" + + +extern int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void credits_regenhash(struct work *work); + +#endif /* CREDITS_H */ diff --git a/algorithm/pluck.c b/algorithm/pluck.c new file mode 100644 index 000000000..093dd68ee --- /dev/null +++ b/algorithm/pluck.c @@ -0,0 +1,482 @@ +/*- + * Copyright 2014 James Lovejoy + * Copyright 2014 phm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "miner.h" + +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_init(uint32_t *state) +{ + memcpy(state, sha256_h, 32); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + + +/* +* SHA256 block compression function. The 256-bit state is transformed via +* the 512-bit input block to produce a new state. +*/ +void sha256_transform(uint32_t *state, const uint32_t *block, int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32(block[i]); + } + else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} +static inline void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +static inline uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) +//note, this is 64 bytes +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) +{ +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15; + int i; + + x00 = (B[0] ^= Bx[0]); + x01 = (B[1] ^= Bx[1]); + x02 = (B[2] ^= Bx[2]); + x03 = (B[3] ^= Bx[3]); + x04 = (B[4] ^= Bx[4]); + x05 = (B[5] ^= Bx[5]); + x06 = (B[6] ^= Bx[6]); + x07 = (B[7] ^= Bx[7]); + x08 = (B[8] ^= Bx[8]); + x09 = (B[9] ^= Bx[9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { + /* Operate on columns. */ + x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); + x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); + + x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); + x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); + + x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); + x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); + + x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); + x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); + + /* Operate on rows. 
*/ + x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); + x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); + + x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); + x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); + + x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); + x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); + + x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); + x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); + } + B[0] += x00; + B[1] += x01; + B[2] += x02; + B[3] += x03; + B[4] += x04; + B[5] += x05; + B[6] += x06; + B[7] += x07; + B[8] += x08; + B[9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +#undef ROTL +} + +void sha256_hash(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, S[i]); +} + +void sha256_hash512(unsigned char *hash, const unsigned char *data) +{ + uint32_t S[16], T[16]; + int i; + + sha256_init(S); + + memcpy(T, data, 64); + + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + sha256_transform(S, T, 0); + + memset(T, 0, 64); + //memcpy(T, data + 64, 0); + ((unsigned char *)T)[0] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + T[15] = 8 * 64; + sha256_transform(S, T, 0); + + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, S[i]); +} + +inline void pluckrehash(void *state, const void *input) +{ + + int i,j; + uint32_t data[20]; + + const int HASH_MEMORY = 128 * 1024; + memcpy(data,input,80); + + uint8_t hashbuffer[128*1024]; //128 KiB work area; note this is a stack allocation, so the mining thread needs a correspondingly large stack 
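+ /* Layout note: hashbuffer is filled in 32-byte steps; the first 32 bytes hold SHA-256 of the 80-byte header (bytes 32..63 start as zeros), and each later step mixes in words read from pseudo-random earlier offsets, so the whole 128 KiB must stay addressable throughout. */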
+ int size = HASH_MEMORY; + memset(hashbuffer, 0, 64); + sha256_hash(&hashbuffer[0], (uint8_t*)data, 80); + for (i = 64; i < size - 32; i += 32) + { + int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases + uint32_t joint[16]; + uint32_t randbuffer[16]; + + uint32_t randseed[16]; + memcpy(randseed, &hashbuffer[i - 64], 64); + if (i>128) + { + memcpy(randbuffer, &hashbuffer[i - 128], 64); + } + else + { + memset(&randbuffer, 0, 64); + } + + xor_salsa8(randbuffer, randseed); + + memcpy(joint, &hashbuffer[i - 32], 32); + //use the last hash value as the seed + for (j = 32; j < 64; j += 4) + { + uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); + joint[j / 4] = *((uint32_t*)&hashbuffer[rand]); + + } + sha256_hash512(&hashbuffer[i], (uint8_t*)joint); + + memcpy(randseed, &hashbuffer[i - 32], 64); + if (i>128) + { + memcpy(randbuffer, &hashbuffer[i - 128], 64); + } + else + { + memset(randbuffer, 0, 64); + } + xor_salsa8(randbuffer, randseed); + for (j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + *((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]); + } + } + + + //printf("cpu hashbuffer %08x nonce %08x\n", ((uint32_t*)hashbuffer)[7],data[19]); + + memcpy(state, hashbuffer, 32); +} + +static const uint32_t diff1targ = 0x0000ffff; + + +/* Used externally as confirmation of correct OCL code */ +int pluck_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + pluckrehash(ohash, data); + + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + + if (tmp_hash7 > diff1targ) + return -1; + + if (tmp_hash7 > Htarg) + return 0; + + return 1; +} + +void pluck_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + + pluckrehash(ohash, data); +} + + +bool scanhash_pluck(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while (1) + { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + pluckrehash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} \ No newline at end of file diff --git a/algorithm/pluck.h b/algorithm/pluck.h new file mode 100644 index 000000000..619eb0137 --- /dev/null +++ b/algorithm/pluck.h @@ -0,0 +1,12 @@ +#ifndef PLUCK_H +#define PLUCK_H + +#include "miner.h" +#define PLUCK_SCRATCHBUF_SIZE (128 * 1024) +#define PLUCK_SECBUF_SIZE (64 * 
1024) + +extern int pluck_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void pluck_regenhash(struct work *work); + +#endif /* PLUCK_H */ diff --git a/algorithm/sysendian.h b/algorithm/sysendian.h new file mode 100644 index 000000000..31ac985fb --- /dev/null +++ b/algorithm/sysendian.h @@ -0,0 +1,140 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the <sys/endian.h> we have isn't usable. 
*/ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include + +#else + +#include + +#if !HAVE_DECL_LE32DEC +static uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE32DEC +static uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32ENC +static void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +static uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + + + +static uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static void +le64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/algorithm/yescrypt-opt.c b/algorithm/yescrypt-opt.c new file mode 100644 index 000000000..b54be469d --- /dev/null +++ b/algorithm/yescrypt-opt.c @@ -0,0 +1,1364 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
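/*
 * Editorial aside, not part of the patch: the fallback serializers above
 * are plain byte shuffles; a quick round-trip check of the intended
 * behaviour (sketch, assumes <assert.h> is available in this unit).
 */
static void sysendian_selftest(void)
{
    uint8_t buf[8];

    be32enc(buf, 0x01020304U);
    /* big-endian: most significant byte first */
    assert(buf[0] == 0x01 && buf[3] == 0x04);
    assert(be32dec(buf) == 0x01020304U);

    le64enc(buf, 0x1122334455667788ULL);
    /* little-endian: least significant byte first */
    assert(buf[0] == 0x88 && buf[7] == 0x11);
    assert(le64dec(buf) == 0x1122334455667788ULL);
}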
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#ifdef __i386__ +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Enable at least SSE2 in the C compiler and use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (portability to older CPUs or testing)." +#elif defined(__x86_64__) +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (for testing only)." +#endif + +#include +#include +#include +#include "algorithm/yescrypt_core.h" +#include "sph/sha256_Y.h" +#include "algorithm/sysendian.h" + +// #include "sph/yescrypt-platform.c" +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; + /* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. + */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } + else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } + else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? 
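/*
 * Editorial aside, not part of the patch: the MAP_HUGETLB path above must
 * round the allocation up to a multiple of the 2 MiB huge-page size,
 * because Linux's munmap() fails on MAP_HUGETLB mappings of other sizes.
 * The rounding idiom, isolated:
 */
static size_t round_up_to_hugepage(size_t size)
{
    const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
    /* the caller checks size + hugepage_mask >= size to rule out overflow */
    return (size + hugepage_mask) & ~hugepage_mask;
}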
size : 0; + return aligned; +} + +static void init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, +const uint8_t * param, size_t paramlen, +uint64_t N, uint32_t r, uint32_t p, +yescrypt_init_shared_flags_t flags, uint32_t mask, +uint8_t * buf, size_t buflen) +{ + yescrypt_shared1_t * shared1 = &shared->shared1; + yescrypt_shared_t dummy, half1, half2; + // yescrypt_shared_t * half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared1->aligned || !shared1->aligned_size) + return -1; + } + else { + init_region(shared1); + } + shared->mask1 = 1; + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + init_region(&dummy.shared1); + dummy.mask1 = 1; + if (yescrypt_kdf(&dummy, shared1, + param, paramlen, NULL, 0, N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.shared1.aligned_size /= 2; + half2.shared1.aligned_size = half1.shared1.aligned_size; + half2.shared1.aligned = (char*)half2.shared1.aligned + half1.shared1.aligned_size; + + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + shared->mask1 = mask; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared1); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(&shared->shared1); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} + + +static void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = *src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +}; + +static void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +}; + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + COMBINE(0, 0, 6) + 
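/*
 * Editorial aside, not part of the patch: blkcpy() and blkxor() above
 * process four 64-bit words per iteration, so count must be a positive
 * multiple of 4; every caller passes a multiple of 8 (one 64-byte salsa20
 * block or more).  A naive reference version with the same effect:
 */
static void blkcpy_ref(uint64_t *dest, const uint64_t *src, size_t count)
{
    size_t i;
    for (i = 0; i < count; i++)
        dest[i] = src[i];
}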
COMBINE(1, 5, 3) + COMBINE(2, 2, 0) + COMBINE(3, 7, 5) + COMBINE(4, 4, 2) + COMBINE(5, 1, 7) + COMBINE(6, 6, 4) + COMBINE(7, 3, 1) +#undef COMBINE +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ + +static void +salsa20_8(uint64_t B[8]) +{ + size_t i; + salsa20_blk_t X; + +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } +#undef x + + { + salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } + +} + +/* These are tunable */ +#define S_BITS 8 +#define S_SIMD 2 +#define S_P 4 +#define S_ROUNDS 6 + +/* Number of S-boxes. Not tunable, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable on their own. */ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) +#define S_P_SIZE (S_P * S_SIMD) +#define S_MIN_R ((S_P * S_SIMD + 15) / 16) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
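/*
 * Editorial aside, not part of the patch: with the tunables above
 * (S_BITS = 8, S_SIMD = 2, S_P = 4), the derived constants work out as
 * below; a sketch of a run-time sanity check (assumes <assert.h>).
 */
static void pwxform_constants_check(void)
{
    assert(S_SIZE1 == 256);      /* 1 << S_BITS */
    assert(S_MASK == 0xFF0);     /* (S_SIZE1 - 1) * S_SIMD * 8 */
    assert(S_P_SIZE == 8);       /* 64-bit words per pwxform block */
    assert(S_MIN_R == 1);        /* smallest scrypt r that fits one block */
    assert(S_SIZE_ALL == 1024);  /* 64-bit words across both S-boxes: 8 KiB */
}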
+ */ + +static void +block_pwxform(uint64_t * B, const uint64_t * S) +{ + uint64_t(*X)[S_SIMD] = (uint64_t(*)[S_SIMD])B; + const uint8_t *S0 = (const uint8_t *)S; + const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); + size_t i, j; + + for (j = 0; j < S_P; j++) { + + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; + uint64_t x1 = Xj[1]; + + for (i = 0; i < S_ROUNDS; i++) { + uint64_t x = x0 & S_MASK2; + const uint64_t *p0, *p1; + + p0 = (const uint64_t *)(S0 + (uint32_t)x); + p1 = (const uint64_t *)(S1 + (x >> 32)); + + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; + } + Xj[0] = x0; + Xj[1] = x1; + } + + + +} + + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + * + * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we + * need to refer to both functions via the same function pointers. + */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) +{ + size_t r1, r2, i; + // S_P_SIZE = 8; + /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ + + r1 = r * 128 / (S_P_SIZE * 8); + /* X <-- B_{r1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); + + /* X <-- X \xor B_i */ + blkxor(Bout, Bin, S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(Bout, S); + + /* for i = 0 to r1 - 1 do */ + for (i = 1; i < r1; i++) { + /* X <-- X \xor B_i */ + blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],S_P_SIZE); + blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(&Bout[i * S_P_SIZE], S); + } + + /* Handle partial blocks */ + if (i * S_P_SIZE < r * 16) { + blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],r * 16 - i * S_P_SIZE); +} + + i = (r1 - 1) * S_P_SIZE / 8; + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* B'_i <-- H(B'_i) */ + salsa20_8(&Bout[i * 8]); + + + i++; +/// not used yescrypt + + for (; i < r2; i++) { + /* B'_i <-- H(B'_i \xor B'_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20_8(&Bout[i * 8]); + } +} + + + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be even and + * no smaller than 2. + */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? 
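/*
 * Editorial aside, not part of the patch: one pwxform round on a single
 * 64-bit lane, as performed inside block_pwxform() above -- a 32x32->64
 * multiply, an S-box-indexed add, and an S-box-indexed xor.
 */
static uint64_t pwxform_round_one_lane(uint64_t x0, const uint8_t *S0,
                                       const uint8_t *S1)
{
    uint64_t x = x0 & S_MASK2;                /* low/high halves select S-box slots */
    const uint64_t *p0 = (const uint64_t *)(S0 + (uint32_t)x);
    const uint64_t *p1 = (const uint64_t *)(S1 + (x >> 32));

    x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; /* hi32 * lo32 */
    x0 += p0[0];
    x0 ^= p1[0];
    return x0;
}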
blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + + blockmix(X, Y, Z, r); + blkcpy(&V[s], Y, s); + X = XY; + + if (NROM && (VROM_mask & 1)) { + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } + + blockmix(Y, X, Z, r); + + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i + 1 - n; + + /* X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + } + + blockmix(Y, X, Z, r); + } + } else { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (rw) { + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + if (rw) { + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The value Nloop must be even. + */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = + (S ? 
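/*
 * Editorial aside, not part of the patch: smix1() above addresses
 * already-written V blocks with j = Wrap(Integerify(X), i), confining the
 * index to the largest power-of-2 window [i - n, i) with n = p2floor(i),
 * which the loop maintains incrementally in n.  As a standalone helper:
 */
static uint64_t wrap_index(uint64_t x, uint64_t i, uint64_t n)
{
    return (x & (n - 1)) + (i - n);   /* n == p2floor(i) */
}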
blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1 | 1; + size_t s = 16 * r; + yescrypt_flags_t rw = flags & YESCRYPT_RW; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t i, j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + if (NROM) { + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* 7: j <-- Integerify(X) mod N */ + j &= N - 1; + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + } + + blockmix(Y, X, Z, r); + } + } else { + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. 
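/*
 * Editorial aside, not part of the patch: p2floor() above clears the
 * lowest set bit until a single bit remains, i.e. rounds down to a power
 * of two.  A quick check of the intended behaviour (assumes <assert.h>):
 */
static void p2floor_check(void)
{
    assert(p2floor(1) == 1);
    assert(p2floor(7) == 4);
    assert(p2floor(8) == 8);
    assert(p2floor(1000) == 512);
}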
+ */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + + + + } + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw,flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + + } + } + + + + +} + +static void +smix_old(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, +yescrypt_flags_t flags, +uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, +uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } + else { + Nloop_all *= t - 1; + } + } + else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) { + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM, Sp, NROM, shared, XYp, NULL); + + + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + } + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? 
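/*
 * Editorial aside, not part of the patch: the t parameter scales the
 * second-loop length in smix() above without changing peak memory.  For
 * the YESCRYPT_RW case, per chunk of Nchunk = N / p blocks, the code
 * computes:
 *   t = 0  -> (Nchunk + 2) / 3        (1/3, rounded up)
 *   t = 1  -> (2 * Nchunk + 2) / 3    (2/3, rounded up)
 *   t >= 2 -> (t - 1) * Nchunk
 * Isolated into a helper with the same arithmetic:
 */
static uint64_t nloop_all_rw(uint64_t Nchunk, uint32_t t)
{
    uint64_t n = Nchunk;
    if (t <= 1) {
        if (t)
            n *= 2;
        n = (n + 2) / 3;
    } else {
        n *= t - 1;
    }
    return n;
}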
&S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY, * S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * 
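/*
 * Editorial aside, not part of the patch: a sketch of the allocation
 * arithmetic above, ignoring the overflow checks the real code performs.
 * Assuming the N = 2048, r = 8 implied by YESCRYPT_SCRATCHBUF_SIZE
 * (128 * 2048 * 8) in algorithm/yescrypt.h, with p = 1 this gives
 * V = 2 MiB, B = 1 KiB, XY = 2112 bytes, plus S = 8 KiB when
 * YESCRYPT_PWXFORM is set.
 */
static size_t yescrypt_need_bytes(uint64_t N, uint32_t r, uint32_t p,
                                  int pwxform)
{
    size_t V  = (size_t)128 * r * N;   /* per-thread RAM region */
    size_t B  = (size_t)128 * r * p;   /* PBKDF2 output blocks */
    size_t XY = (size_t)256 * r + 64;  /* blockmix scratch */
    size_t S  = pwxform ? S_SIZE_ALL * sizeof(uint64_t) : 0; /* S-boxes */
    return V + B + XY + S;
}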
sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,(uint8_t *)B, B_size); + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, NROM, shared, XY, S); + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +int +yescrypt_kdf_old(const yescrypt_shared_t * shared, yescrypt_local_t * local, +const uint8_t * passwd, size_t passwdlen, +const uint8_t * salt, size_t saltlen, +uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, +uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, *V, *XY, *S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. 
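/*
 * Editorial summary, not part of the patch: the flow implemented above
 * when t or flags are nonzero (i.e. any yescrypt mode, not classic
 * scrypt):
 *   passwd <- SHA256(passwd)                          pre-hash
 *   B      <- PBKDF2-SHA256(passwd, salt, 1 iter)     fill p blocks
 *   B      <- SMix(B)                                 memory-hard core
 *   buf    <- PBKDF2-SHA256(passwd, B, 1 iter)        extract
 *   if buflen == 32:
 *       buf <- SHA256(HMAC-SHA256(key = buf, msg = salt))   SCRAM-style
 */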
+ */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r)* (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } + else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, (uint8_t *)B, B_size); + + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. 
The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + diff --git a/algorithm/yescrypt.c b/algorithm/yescrypt.c new file mode 100644 index 000000000..de00d0f33 --- /dev/null +++ b/algorithm/yescrypt.c @@ -0,0 +1,128 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "algorithm/yescrypt_core.h" + +static const uint32_t diff1targ = 0x0000ffff; + +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +/* Used externally as confirmation of correct OCL code */ +int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + yescrypt_hash((unsigned char*)data,(unsigned char*)ohash); + + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + + if (tmp_hash7 > diff1targ) + return -1; + + if (tmp_hash7 > Htarg) + return 0; + + return 1; +} + +void yescrypt_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ohash); + +} + + +bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while (1) + { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ostate); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} \ No newline at end of file diff --git a/algorithm/yescrypt.h b/algorithm/yescrypt.h new file mode 100644 index 000000000..b51cb4959 --- /dev/null +++ b/algorithm/yescrypt.h @@ -0,0 +1,10 @@ +#ifndef YESCRYPT_H +#define YESCRYPT_H + +#include "miner.h" +#define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar +#define YESCRYP_SECBUF_SIZE (128*64*8) +extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void yescrypt_regenhash(struct work *work); + +#endif /* YESCRYPT_H */ diff --git a/algorithm/yescrypt_core.h b/algorithm/yescrypt_core.h new file mode 100644 index 000000000..64b9a11f6 --- /dev/null +++ b/algorithm/yescrypt_core.h @@ -0,0 +1,376 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output); +extern void yescrypt_hash(const unsigned char *input, unsigned char *output); + + + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared1_t; +typedef struct { + yescrypt_shared1_t shared1; + uint32_t mask1; +} yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. 
+ */ +typedef enum { +/* public */ + YESCRYPT_WORM = 0, + YESCRYPT_RW = 1, + YESCRYPT_PARALLEL_SMIX = 2, + YESCRYPT_PWXFORM = 4, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ + __YESCRYPT_INIT_SHARED) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, + * buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). + * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with + * shared->shared1.aligned being the start address of the ROM and + * shared->shared1.aligned_size being its size (which must be consistent with + * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV + * shared memory segment allocated by the caller. + * + * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it + * should be set to 1, to interleave RAM and ROM accesses, which works well + * when both regions reside in the machine's RAM anyway. Other values may be + * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask + * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask + * values: + * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop + * 0 0 1/2 + * 1 1/2 1/2 + * 2 0 1/4 + * 3 1/4 1/4 + * 6 0 1/8 + * 7 1/8 1/8 + * 14 0 1/16 + * 15 1/16 1/16 + * 1022 0 1/1024 + * 1023 1/1024 1/1024 + * + * Actual computation of the ROM contents may be avoided, if you don't intend + * to use a ROM but need a dummy shared structure, by calling this function + * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the + * arguments starting with param and on. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, uint32_t __mask, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. 
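/*
 * Editorial example, not part of the patch: building the "dummy shared
 * structure" the comment above refers to (no ROM), exactly as described
 * -- NULL/zero for every argument from param onward -- then releasing it.
 */
static int with_dummy_rom_example(void)
{
    yescrypt_shared_t shared;
    if (yescrypt_init_shared(&shared, NULL, 0,
        0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
        return -1;
    /* ... pass &shared to yescrypt_kdf() here ... */
    return yescrypt_free_shared(&shared);
}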
+ */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. shared and flags may request + * special modes as described below. local is the thread-local data + * structure, allowing to preserve and reuse a memory allocation across calls, + * thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. + * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and + * passing a dummy shared structure (see the description of + * yescrypt_init_shared() above for how to produce one). In this mode, the + * thread-local memory region (RAM) is first sequentially written to and then + * randomly read from. This algorithm is friendly towards time-memory + * tradeoffs (TMTO), available both to defenders (albeit not in this + * implementation) and to attackers. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. 
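/*
 * Editorial example, not part of the patch: classic scrypt through this
 * API as the comment above prescribes -- t = 0, flags = YESCRYPT_WORM,
 * and a dummy shared structure.  N = 16384, r = 8, p = 1 are illustrative
 * parameters only.
 */
static int classic_scrypt_example(yescrypt_local_t *local,
                                  const uint8_t *pw, size_t pwlen,
                                  const uint8_t *salt, size_t saltlen,
                                  uint8_t *out, size_t outlen)
{
    yescrypt_shared_t dummy;
    int rc;

    if (yescrypt_init_shared(&dummy, NULL, 0,
        0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
        return -1;
    rc = yescrypt_kdf(&dummy, local, pw, pwlen, salt, saltlen,
                      16384, 8, 1, 0, YESCRYPT_WORM, out, outlen);
    yescrypt_free_shared(&dummy);
    return rc;
}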
This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as + * long as the defender has enough memory that is just as fast as the smaller + * per-thread regions would be, doesn't expect to ever need greater + * flexibility (except possibly via TMTO), and doesn't need backwards + * compatibility with classic scrypt, there are no other serious drawbacks to + * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, + * this new approach to parallelization makes TMTO less inefficient. (This is + * an unfortunate side-effect of avoiding some random writes, as we have to in + * order to allow for parallel threads to access a common memory region without + * synchronization overhead.) Thus, in this mode this setting poses an extra + * tradeoff of its own (higher area-time cost for a subset of attackers vs. + * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the + * way the running time is to be controlled from N*r*p (for classic scrypt) to + * N*r (in this modification). All of this applies only when p > 1. For + * p = 1, this setting is a no-op. + * + * Passing a real shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->shared1.aligned and + * shared->shared1.aligned_size fields may be set by the caller of + * yescrypt_kdf() manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If the shared structure is + * not dummy, a ROM is used and YESCRYPT_RW is required. 
Otherwise, whether to + * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. shared and + * local must be initialized as described above for yescrypt_kdf(). buf must + * be large enough (as indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. + * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to use the + * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESCRYPT_H_ */ diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c new file mode 100644 index 000000000..cf7067d02 --- /dev/null +++ b/algorithm/yescryptcommon.c @@ -0,0 +1,360 @@ +/*- + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "algorithm/yescrypt_core.h"
+
+#define BYTES2CHARS(bytes) \
+	((((bytes) * 8) + 5) / 6)
+
+#define HASH_SIZE 32 /* bytes */
+#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
+#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
+static const char * const itoa64 =
+	"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen,
+    uint32_t src, uint32_t srcbits)
+{
+	uint32_t bit;
+
+	for (bit = 0; bit < srcbits; bit += 6) {
+		if (dstlen < 1)
+			return NULL;
+		*dst++ = itoa64[src & 0x3f];
+		dstlen--;
+		src >>= 6;
+	}
+
+	return dst;
+}
+
+static uint8_t * encode64(uint8_t * dst, size_t dstlen,
+    const uint8_t * src, size_t srclen)
+{
+	size_t i;
+
+	for (i = 0; i < srclen; ) {
+		uint8_t * dnext;
+		uint32_t value = 0, bits = 0;
+		do {
+			value |= (uint32_t)src[i++] << bits;
+			bits += 8;
+		} while (bits < 24 && i < srclen);
+		dnext = encode64_uint32(dst, dstlen, value, bits);
+		if (!dnext)
+			return NULL;
+		dstlen -= dnext - dst;
+		dst = dnext;
+	}
+
+	return dst;
+}
+
+static int decode64_one(uint32_t * dst, uint8_t src)
+{
+	const char * ptr = strchr(itoa64, src);
+	if (ptr) {
+		*dst = ptr - itoa64;
+		return 0;
+	}
+	*dst = 0;
+	return -1;
+}
+
+static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits,
+    const uint8_t * src)
+{
+	uint32_t bit;
+	uint32_t value;
+
+	value = 0;
+	for (bit = 0; bit < dstbits; bit += 6) {
+		uint32_t one;
+		if (decode64_one(&one, *src)) {
+			*dst = 0;
+			return NULL;
+		}
+		src++;
+		value |= one << bit;
+	}
+
+	*dst = value;
+	return src;
+}
+
+uint8_t *
+yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local,
+    const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * setting,
+    uint8_t * buf, size_t buflen)
+{
+	uint8_t hash[HASH_SIZE];
+	const uint8_t * src, * salt;
+	uint8_t * dst;
+	size_t prefixlen, saltlen, need;
+	uint8_t version;
+	uint64_t N;
+	uint32_t r, p;
+	yescrypt_flags_t flags = YESCRYPT_WORM;
+	fflush(stdout);
+	if (setting[0] != '$' || setting[1] != '7')
+	{
+		fflush(stdout);
+		return NULL;
+	}
+	fflush(stdout);
+	src = setting + 2;
+	fflush(stdout);
+	switch ((version = *src)) {
+	case '$':
+		fflush(stdout);
+		break;
+	case 'X':
+		src++;
+		flags = YESCRYPT_RW;
+		fflush(stdout);
+		break;
+	default:
+	{
+		fflush(stdout);
+		return NULL;
+	}
+	}
+
+	fflush(stdout);
+	if (*src != '$') {
+		uint32_t decoded_flags;
+		if (decode64_one(&decoded_flags, *src))
+		{
+			fflush(stdout);
+			return NULL;
+		}
+		flags = decoded_flags;
+		if (*++src != '$')
+		{
+			fflush(stdout);
+			return NULL;
+		}
+	}
+	src++;
+
+	{
+		uint32_t N_log2;
+		if (decode64_one(&N_log2, *src))
+		{
+			return NULL;
+		}
+		src++;
+		N = (uint64_t)1 << N_log2;
+	}
+
+	src = decode64_uint32(&r, 30, src);
+	if (!src)
+	{
+		return NULL;
+	}
+
+	src = decode64_uint32(&p, 30, src);
+	if (!src)
+	{
+		return NULL;
+	}
+
+	prefixlen = src - setting;
+
+	salt = src;
+	src = (uint8_t *)strrchr((char *)salt, '$');
+	if (src)
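+	/* Descriptive note: the setting being parsed is "$7" [ "X" | flag char ]
+	 * "$" <1 char N_log2> <5 chars r> <5 chars p> <salt>, optionally
+	 * followed by '$' and the 43-character base-64 hash; the salt field
+	 * therefore runs up to the last '$', or (below) to the end of the
+	 * string when no hash part is present. */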
saltlen = src - salt; + else + saltlen = strlen((char *)salt); + + need = prefixlen + saltlen + 1 + HASH_LEN + 1; + if (need > buflen || need < saltlen) + + { + fflush(stdout); + return NULL; + } + +fflush(stdout); + if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + N, r, p, 0, flags, hash, sizeof(hash))) + { + fflush(stdout); + return NULL; + } + + dst = buf; + memcpy(dst, setting, prefixlen + saltlen); + dst += prefixlen + saltlen; + *dst++ = '$'; + + dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); + /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its + * memory allocations yet anyway. */ + if (!dst || dst >= buf + buflen) /* Can't happen */ + { + return NULL; + } + + *dst = 0; /* NUL termination */ + fflush(stdout); + return buf; +} + +uint8_t * +yescrypt(const uint8_t * passwd, const uint8_t * setting) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; + yescrypt_shared_t shared; + yescrypt_local_t local; + uint8_t * retval; + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return NULL; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + retval = yescrypt_r(&shared, &local, + passwd, 80, setting, buf, sizeof(buf)); + // printf("hashse='%s'\n", (char *)retval); + if (yescrypt_free_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + if (yescrypt_free_shared(&shared)) + return NULL; + return retval; + +} + +uint8_t * +yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen, + uint8_t * buf, size_t buflen) +{ + uint8_t * dst; + size_t prefixlen = 3 + 1 + 5 + 5; + size_t saltlen = BYTES2CHARS(srclen); + size_t need; + + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + if (flags) { + if (flags & ~0x3f) + return NULL; + + prefixlen++; + if (flags != YESCRYPT_RW) + prefixlen++; + } + + need = prefixlen + saltlen + 1; + if (need > buflen || need < saltlen || saltlen < srclen) + return NULL; + + if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) + return NULL; + + dst = buf; + *dst++ = '$'; + *dst++ = '7'; + if (flags) { + *dst++ = 'X'; /* eXperimental, subject to change */ + if (flags != YESCRYPT_RW) + *dst++ = itoa64[flags]; + } + *dst++ = '$'; + + *dst++ = itoa64[N_log2]; + + dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64(dst, buflen - (dst - buf), src, srclen); + if (!dst || dst >= buf + buflen) /* Can't happen */ + return NULL; + + *dst = 0; /* NUL termination */ + + return buf; +} + +uint8_t * +yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; + return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, + buf, sizeof(buf)); +} + +static int +yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, + uint8_t * buf, size_t buflen) +{ + static __thread int initialized = 0; + static __thread yescrypt_shared_t shared; + static __thread yescrypt_local_t local; + +// static __declspec(thread) int initialized = 0; +// static __declspec(thread) yescrypt_shared_t shared; +// static __declspec(thread) yescrypt_local_t local; + + int retval; + if 
(!initialized) { +/* "shared" could in fact be shared, but it's simpler to keep it private + * along with "local". It's dummy and tiny anyway. */ + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return -1; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return -1; + } + initialized = 1; + } + retval = yescrypt_kdf(&shared, &local, + passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, + buf, buflen); + + return retval; +} + +void yescrypt_hash(const unsigned char *input, unsigned char *output) +{ + + yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32); +} diff --git a/driver-opencl.c b/driver-opencl.c index 48ffc517d..5b01fc963 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -49,6 +49,7 @@ extern bool opt_loginput; extern char *opt_kernel_path; extern int gpur_thr_id; extern bool opt_noadl; +extern bool opt_lyra; extern void *miner_thread(void *userdata); extern int dev_from_id(int thr_id); @@ -257,14 +258,14 @@ char *set_gpu_threads(const char *_arg) if (nextptr == NULL) return "Invalid parameters for set_gpu_threads"; val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; @@ -1357,7 +1358,12 @@ static bool opencl_thread_init(struct thr_info *thr) static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work) { - work->blk.work = work; + + if (opt_lyra) { + work->blk.work = work; + precalc_hash_blake256(&work->blk, 0, (uint32_t *)(work->data)); + } + else {work->blk.work = work;} thr->pool_no = work->pool->pool_no; return true; } @@ -1425,7 +1431,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. 
(clEnqueueNDRangeKernel) %d", status,i); return -1; } } @@ -1456,6 +1462,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); +// postcalc_hash(thr); +// submit_tested_work(thr, work); +// submit_work_async(work); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); @@ -1477,6 +1486,12 @@ static void opencl_thread_shutdown(struct thr_info *thr) clFinish(clState->commandQueue); clReleaseMemObject(clState->outputBuffer); clReleaseMemObject(clState->CLbuffer0); + if (clState->buffer1) + clReleaseMemObject(clState->buffer1); + if (clState->buffer2) + clReleaseMemObject(clState->buffer2); + if (clState->buffer3) + clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); clReleaseKernel(clState->kernel); diff --git a/example.bat b/example.bat new file mode 100644 index 000000000..a1f93d202 --- /dev/null +++ b/example.bat @@ -0,0 +1,16 @@ +rem setx GPU_MAX_HEAP_SIZE 100 +setx GPU_USE_SYNC_OBJECTS 1 +setx GPU_MAX_ALLOC_PERCENT 100 +del *.bin +@rem sgminer.exe +@rem pause +@rem sgminer.exe --no-submit-stale --kernel Lyra2RE -o stratum+tcp://pool.verters.com:4444 -u djm34t.user -p password --gpu-platform 1 -I 17 -g 16 -w 32 +sgminer.exe --kernel Lyra2REv2 -o http://127.0.0.1:7785/ -u Dominique -p MyPass --gpu-platform 1 -I 18 -g 8 -w 32 +@rem sgminer.exe --no-submit-stale --kernel pluck -o stratum+tcp://sup.suprnova.cc:7777 -u djm34.2 -p password --gpu-platform 2 --thread-concurrency 8192 -w 4 -I 12 +@rem sgminer.exe --no-submit-stale --kernel yescrypt -o stratum+tcp://mine2.bsty.nonce-pool.com:4095 -u djm34.1 -p password --gpu-platform 1 -w 32 --thread-concurrency 512 --text-only --debug +@rem sgminer.exe --no-submit-stale --kernel yescrypt -o stratum+tcp://mine2.bsty.nonce-pool.com:4095 -u djm34.1 -p password --gpu-platform 1 -w 8 --thread-concurrency 1024 -I 9 + +@rem sgminer.exe --no-submit-stale --kernel nscrypto -o http://127.0.0.1:9989/ -u dom -p password --gpu-platform 1 -I 20 +@rem sgminer.exe --no-submit-stale --kernel nscrypto -o stratum+tcp://drop.suprnova.cc:7890 -u djm34.1 -p password --gpu-platform 0 -I 18 --device 0 --remove-disabled --text-only --debug + +pause \ No newline at end of file diff --git a/findnonce.c b/findnonce.c index be9ba0dfd..8489c960c 100644 --- a/findnonce.c +++ b/findnonce.c @@ -214,6 +214,7 @@ static void *postcalc_hash(void *userdata) void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) { + struct pc_data *pcd = (struct pc_data *)malloc(sizeof(struct pc_data)); int buffersize; @@ -225,8 +226,7 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) pcd->thr = thr; pcd->work = copy_work(work); buffersize = BUFFERSIZE; - - memcpy(&pcd->res, res, buffersize); + memcpy(&pcd->res, res, buffersize); if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) { applog(LOG_ERR, "Failed to create postcalc_hash thread"); @@ -234,3 +234,142 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) free(pcd); } } + +// BLAKE 256 14 rounds (standard) + +typedef struct +{ + uint32_t h[8]; + uint32_t t; +} blake_state256; + +#define NB_ROUNDS32 14 + +const uint8_t blake_sigma[][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 
8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +const uint32_t blake_u256[16] = +{ + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, + 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, + 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +#define ROT32(x,n) (((x)<<(32-n))|( (x)>>(n))) +//#define ROT32(x,n) (rotate((uint)x, (uint)32-n)) +#define ADD32(x,y) ((uint32_t)((x) + (y))) +#define XOR32(x,y) ((uint32_t)((x) ^ (y))) + +#define G(a,b,c,d,i) \ +do {\ + v[a] += XOR32(m[blake_sigma[r][i]], blake_u256[blake_sigma[r][i+1]]) + v[b];\ + v[d] = ROT32(XOR32(v[d],v[a]),16);\ + v[c] += v[d];\ + v[b] = ROT32(XOR32(v[b],v[c]),12);\ + v[a] += XOR32(m[blake_sigma[r][i+1]], blake_u256[blake_sigma[r][i]]) + v[b]; \ + v[d] = ROT32(XOR32(v[d],v[a]), 8);\ + v[c] += v[d];\ + v[b] = ROT32(XOR32(v[b],v[c]), 7);\ + } while (0) + + +// compress a block +void blake256_compress_block(blake_state256 *S, uint32_t *m) +{ + uint32_t v[16]; + int i, r; + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + + v[8] = blake_u256[0]; + v[9] = blake_u256[1]; + v[10] = blake_u256[2]; + v[11] = blake_u256[3]; + v[12] = blake_u256[4]; + v[13] = blake_u256[5]; + v[14] = blake_u256[6]; + v[15] = blake_u256[7]; + + v[12] ^= S->t; + v[13] ^= S->t; + + for (r = 0; r < NB_ROUNDS32; ++r) + { + /* column step */ + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + /* diagonal step */ + G(0, 5, 10, 15, 8); + G(1, 6, 11, 12, 10); + G(2, 7, 8, 13, 12); + G(3, 4, 9, 14, 14); + } + + for (i = 0; i < 16; ++i) S->h[i & 7] ^= v[i]; +} + + +void blake256_init(blake_state256 *S) +{ + S->h[0] = 0x6a09e667; + S->h[1] = 0xbb67ae85; + S->h[2] = 0x3c6ef372; + S->h[3] = 0xa54ff53a; + S->h[4] = 0x510e527f; + S->h[5] = 0x9b05688c; + S->h[6] = 0x1f83d9ab; + S->h[7] = 0x5be0cd19; + S->t = 0; +} + + +void blake256_update(blake_state256 *S, const uint32_t *in) +{ + uint32_t m[16]; + int i; + S->t = 512; + for (i = 0; i < 16; ++i) m[i] = in[i]; + blake256_compress_block(S, m); +} + + + +void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) +{ +blake_state256 S; +blake256_init(&S); +blake256_update(&S, data); + +blk->ctx_a = S.h[0]; +blk->ctx_b = S.h[1]; +blk->ctx_c = S.h[2]; +blk->ctx_d = S.h[3]; +blk->ctx_e = S.h[4]; +blk->ctx_f = S.h[5]; +blk->ctx_g = S.h[6]; +blk->ctx_h = S.h[7]; + +blk->cty_a = data[16]; +blk->cty_b = data[17]; +blk->cty_c = data[18]; + +} diff --git a/findnonce.h b/findnonce.h index 9376a57be..354cbf017 100644 --- a/findnonce.h +++ b/findnonce.h @@ -10,5 +10,5 @@ extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); extern void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res); - +extern void 
precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); #endif /*FINDNONCE_H*/ diff --git a/kernel/Lyra2.cl b/kernel/Lyra2.cl new file mode 100644 index 000000000..276e0c4e5 --- /dev/null +++ b/kernel/Lyra2.cl @@ -0,0 +1,178 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + +/*Blake2b IV Array*/ +__constant static const sph_u64 blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ + + +static inline uint2 ror2(uint2 v, unsigned a) { + uint2 result; + unsigned n = 64 - a; + if (n == 32) { return (uint2)(v.y,v.x); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +static inline uint2 ror2l(uint2 v, unsigned a) { + uint2 result; + result.y = ((v.x << (32-a)) | (v.y >> (a))); + result.x = ((v.y << (32-a)) | (v.x >> (a))); + return result; +} +static inline uint2 ror2r(uint2 v, unsigned a) { + uint2 result; + result.y = ((v.y << (64-a)) | (v.x >> (a-32))); + result.x = ((v.x << (64-a)) | (v.y >> (a-32))); + return result; +} +/* +#define G(a,b,c,d) \ + do { \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2l(b, 24); \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ + } while(0) +*/ +#define G(a,b,c,d) \ + do { \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = as_uint2(as_uchar8(b).s34567012); \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ + } while(0) + + + +/*One Round of the Blake2b's compression function*/ +#define round_lyra(v) \ + do { \ + G(v[ 0],v[ 4],v[ 8],v[12]); \ + G(v[ 1],v[ 5],v[ 9],v[13]); \ + G(v[ 2],v[ 6],v[10],v[14]); \ + G(v[ 3],v[ 7],v[11],v[15]); \ + G(v[ 0],v[ 5],v[10],v[15]); \ + G(v[ 1],v[ 6],v[11],v[12]); \ 
+ G(v[ 2],v[ 7],v[ 8],v[13]); \ + G(v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + + +#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) \ + { \ + for (int i = 0; i < 8; i++) \ + { \ +\ + for (int j = 0; j < 12; j++) {state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut]));} \ + round_lyra(state); \ + for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];} \ +\ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + \ + } + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) \ + { \ + for (int i = 0; i < 8; i++) \ + { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut])); \ + \ + round_lyra(state); \ + for (int j = 0; j < 12; j++) {Matrix[j + 12 * i][rowOut] ^= state[j];} \ +\ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + \ + } +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + } \ No newline at end of file diff --git a/kernel/Lyra2RE.cl b/kernel/Lyra2RE.cl new file mode 100644 index 000000000..1283867ff --- /dev/null +++ b/kernel/Lyra2RE.cl @@ -0,0 +1,497 @@ +/* + * Lyra2RE kernel implementation. 
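+ *
+ * The hash travels through five chained kernels: search (Blake-256, 14
+ * rounds, covering the second half of the 80-byte block header together
+ * with the nonce; the first half is precomputed on the host by
+ * precalc_hash_blake256), search1 (Keccak-256), search2 (Lyra2, which
+ * keeps an 8-row sponge matrix of about 6 KB in private memory per
+ * work-item), search3 (Skein-256), and search4 (Groestl-256, whose final
+ * 64-bit output word is compared against the target).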
+ * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long long sph_u64; +typedef long long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +//#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) + +/* +inline ulong rol64 (ulong l,ulong n) +{ +if (n<=32) { +uint2 t = rotate(as_uint2(l), (n)); +return as_ulong((uint2)(bitselect(t.s0, t.s1, (uint)(1 << (n)) - 1), bitselect(t.s0, t.s1, (uint)(~((1 << (n)) - 1))))); } +else { +uint2 t = rotate(as_uint2(l), (n - 32)); +return as_ulong((uint2)(bitselect(t.s1, t.s0, (uint)(1 << (n - 32)) - 1), bitselect(t.s1, t.s0, (uint)(~((1 << (n - 32)) - 1))))); +}} +*/ + + +/* +static inline ulong rol64(const ulong vw, unsigned n) { + uint2 result; + uint2 v=as_uint2(vw); + if (n == 32) { return as_ulong((uint2)(v.y,v.x)); } + if (n < 32) { + result.y = ( (v.y << (n)) | (v.x >> (32 - n)) ); + result.x = ( (v.x << (n)) | (v.y >> (32 - n)) ); + } + else { + result.y = ( (v.x << (n - 32)) | (v.y >> (64 - n)) ); + result.x = ( (v.y << (n - 32)) | (v.x >> (64 - n)) ); + } + return as_ulong(result); +} +*/ + +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y,v.x)); } + if (n < 32) { + 
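+		// Descriptive note: with n = 64 - a, each 32-bit lane of the
+		// result combines shifted bits from both input lanes; splitting
+		// the 64-bit rotation into paired 32-bit shifts like this is
+		// presumably meant to map better onto GPU ALUs, which operate
+		// on 32-bit lanes.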
result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +#define SPH_ROTR64(l,n) ror64(l,n) + + + +#include "blake256.cl" +#include "groestl256.cl" +#include "Lyra2.cl" +#include "keccak1600.cl" +#include "skein256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) + +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC64LE(x) (*(const __global sph_u64 *) (x)); +#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); +#endif + +typedef union { + unsigned char h1[64]; + uint h4[16]; + ulong h8[8]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + __global hash_t* hashes, + // precalc hash from fisrt part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u32 h[8]; + sph_u32 m[16]; + sph_u32 v[16]; + + +h[0]=h0; +h[1]=h1; +h[2]=h2; +h[3]=h3; +h[4]=h4; +h[5]=h5; +h[6]=h6; +h[7]=h7; +// compress 2nd round + m[0] = in16; + m[1] = in17; + m[2] = in18; + m[3] = SWAP4(gid); + + for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} + + for (int i = 0; i < 8; i++) {v[i] = h[i];} + + v[8] = c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + v[12] = c_u256[4] ^ 640; + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + for (int r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i];} + +for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);} + +barrier(CLK_LOCAL_MEM_FENCE); + +} + +// keccak256 + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u64 keccak_gpu_state[25]; + + for (int i = 0; i<25; i++) { + if (i<4) { keccak_gpu_state[i] = hash->h8[i]; } + else { keccak_gpu_state[i] = 0; } + } + keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_block(keccak_gpu_state); + for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; } +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +/// lyra2 algo + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + + + uint2 state[16]; + + for (int i = 0; i<4; i++) { state[i] = as_uint2(hash->h8[i]);} //password + for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt + + for (int i = 0; i<8; i++) { state[i + 8] = as_uint2(blake2b_IV[i]); } + + // blake2blyra x2 + + for (int i = 
0; i<24; i++) { round_lyra(state); } //because 12 is not enough + + __private uint2 Matrix[96][8]; // very uncool + /// reducedSqueezeRow0 + + for (int i = 0; i < 8; i++) + { + for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } + round_lyra(state); + } + + /// reducedSqueezeRow1 + + for (int i = 0; i < 8; i++) + { + for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } + round_lyra(state); + for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } + } + + + reduceDuplexRowSetup(1, 0, 2); + reduceDuplexRowSetup(2, 1, 3); + reduceDuplexRowSetup(3, 0, 4); + reduceDuplexRowSetup(4, 3, 5); + reduceDuplexRowSetup(5, 2, 6); + reduceDuplexRowSetup(6, 1, 7); + + sph_u32 rowa; + rowa = state[0].x & 7; + + reduceDuplexRow(7, rowa, 0); + rowa = state[0].x & 7; + reduceDuplexRow(0, rowa, 3); + rowa = state[0].x & 7; + reduceDuplexRow(3, rowa, 6); + rowa = state[0].x & 7; + reduceDuplexRow(6, rowa, 1); + rowa = state[0].x & 7; + reduceDuplexRow(1, rowa, 4); + rowa = state[0].x & 7; + reduceDuplexRow(4, rowa, 7); + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 2); + rowa = state[0].x & 7; + reduceDuplexRow(2, rowa, 5); + + absorbblock(rowa); + + for (int i = 0; i<4; i++) {hash->h8[i] = as_ulong(state[i]);} +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0,dt1,dt2,dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0]=t12[0]; + t[1]=t12[1]; + t[2]=t12[2]; + + dt0=hash->h8[0]; + dt1=hash->h8[1]; + dt2=hash->h8[2]; + dt3=hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i<19; i+=2) {Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { h[8] ^= h[i]; } + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global hash_t* hashes, __global uint* output, const ulong target) +{ +// __local ulong T0[256], T1[256], T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; + // uint u = get_local_id(0); +/* +for (uint u = get_local_id(0); u < 256; u += get_local_size(0)) { + + + T0[u] = T0_G[u]; + T1[u] = T1_G[u]; + T2[u] = T2_G[u]; + T3[u] = T3_G[u]; + T4[u] = T4_G[u]; + T5[u] = T5_G[u]; + T6[u] = T6_G[u]; + T7[u] = T7_G[u]; + } +barrier(CLK_LOCAL_MEM_FENCE); + + T1[u] = SPH_ROTL64(T0[u], 8UL); + T2[u] = SPH_ROTL64(T0[u], 16UL); + T3[u] = SPH_ROTL64(T0[u], 24UL); + T4[u] = SPH_ROTL64(T0[u], 32UL); + T5[u] = SPH_ROTL64(T0[u], 40UL); + T6[u] = SPH_ROTL64(T0[u], 48UL); + T7[u] = SPH_ROTL64(T0[u], 56UL); + +*/ + uint gid = get_global_id(0); + + __global hash_t *hash = &(hashes[gid - get_global_offset(0)]); + + + 
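+	// Descriptive note: the 32-byte Skein result plus Groestl-256 padding
+	// forms a single 64-byte block. The kernel runs the 10-round P and Q
+	// permutations, but the final P round of the output transform is
+	// recomputed below for only the one T-table column (state[7]) needed
+	// for the 64-bit "<= target" test; candidate nonces are collected via
+	// atomic_inc and re-checked on the host by postcalc_hash_async().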
+ __private ulong message[8], state[8]; + __private ulong t[8]; + + + for (int u = 0; u<4; u++) { message[u] = hash->h8[u]; } + + message[4] = 0x80UL; + message[5] = 0UL; + message[6] = 0UL; + message[7] = 0x0100000000000000UL; + + + for (int u = 0; u<8; u++) { state[u] = message[u]; } + state[7] ^= 0x0001000000000000UL; + + + + for (int r = 0; r < 10; r ++) {ROUND_SMALL_P(state, r); } + + state[7] ^= 0x0001000000000000UL; + + + for (int r = 0; r < 10; r ++) {ROUND_SMALL_Q(message, r); } + + + for (int u = 0; u<8; u++) { state[u] ^= message[u]; } + message[7] = state[7]; + + for (int r = 0; r < 9; r ++) {ROUND_SMALL_P(state, r); } + uchar8 State; + State.s0 =as_uchar8(state[7]^0x79).s0; + State.s1 =as_uchar8(state[0]^0x09).s1; + State.s2 =as_uchar8(state[1]^0x19).s2; + State.s3 =as_uchar8(state[2]^0x29).s3; + State.s4 =as_uchar8(state[3]^0x39).s4; + State.s5 =as_uchar8(state[4]^0x49).s5; + State.s6 =as_uchar8(state[5]^0x59).s6; + State.s7 =as_uchar8(state[6]^0x69).s7; + + + state[7] =T0_G[State.s0] + ^ R64(T0_G[State.s1], 8) + ^ R64(T0_G[State.s2], 16) + ^ R64(T0_G[State.s3], 24) + ^ T4_G[State.s4] + ^ R64(T4_G[State.s5], 8) + ^ R64(T4_G[State.s6], 16) + ^ R64(T4_G[State.s7], 24) ^message[7]; + +// t[7] ^= message[7]; + barrier(CLK_LOCAL_MEM_FENCE); + + + bool result = ( state[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } + +} + + +#endif // LYRA2RE_CL \ No newline at end of file diff --git a/kernel/Lyra2REv2.cl b/kernel/Lyra2REv2.cl new file mode 100644 index 000000000..fd4b11255 --- /dev/null +++ b/kernel/Lyra2REv2.cl @@ -0,0 +1,525 @@ +/* + * Lyra2RE kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ +// typedef unsigned int uint; +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long sph_u64; +typedef long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +//#define SPH_ROTR64(l,n) ror64(l,n) +#define memshift 3 +#include "blake256.cl" +#include "Lyra2v2.cl" +#include "keccak1600.cl" +#include "skein256.cl" +#include "cubehash.cl" +#include "bmw256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654) +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC64LE(x) (*(const __global sph_u64 *) (x)); +#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); +#endif + +typedef union { + unsigned char h1[32]; + uint h4[8]; + ulong h8[4]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + __global uchar* hashes, + // precalc hash from fisrt part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + +// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + unsigned int h[8]; + unsigned int m[16]; + unsigned int v[16]; + + +h[0]=h0; +h[1]=h1; +h[2]=h2; +h[3]=h3; +h[4]=h4; +h[5]=h5; +h[6]=h6; +h[7]=h7; +// compress 2nd round + m[0] = in16; + m[1] = in17; + m[2] = in18; + m[3] = SWAP4(gid); + + for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} + + for (int i = 0; i < 8; i++) {v[i] = h[i];} + + v[8] 
= c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + v[12] = c_u256[4] ^ 640; + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + for (int r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i];} + +for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);} + +barrier(CLK_LOCAL_MEM_FENCE); + +} + +// keccak256 + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u64 keccak_gpu_state[25]; + + for (int i = 0; i<25; i++) { + if (i<4) { keccak_gpu_state[i] = hash->h8[i]; } + else { keccak_gpu_state[i] = 0; } + } + keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_block(keccak_gpu_state); + for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; } +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +// cubehash256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + +/// lyra2 algo + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar* hashes,__global uchar* matrix ) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * 8 * (get_global_id(0) % MAX_GLOBAL_THREADS))); + +// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32; + ulong4 state[4]; + + 
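+	// Descriptive note: state[] is the 16-word (4 x ulong4) sponge state
+	// driven by the Blake2b-style round_lyra() above. Lyra2v2 uses a
+	// 4-row x 4-column matrix with memshift (= 3) ulong4 vectors per
+	// cell, so each work-item owns 4 * 4 * 3 * 32 = 1536 bytes of the
+	// global DMatrix scratchpad, matching the per-thread offset computed
+	// above.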
state[0].x = hash->h8[0]; //password + state[0].y = hash->h8[1]; //password + state[0].z = hash->h8[2]; //password + state[0].w = hash->h8[3]; //password + state[1] = state[0]; + state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL); + state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL); + for (int i = 0; i<12; i++) { round_lyra(state); } + + state[0] ^= (ulong4)(0x20,0x20,0x20,0x01); + state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000); + + for (int i = 0; i<12; i++) { round_lyra(state); } + + + uint ps1 = (memshift * 3); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix)[j+s1] = state[j]; + + round_lyra(state); + } + + reduceDuplexf(state,DMatrix); + + reduceDuplexRowSetupf(1, 0, 2,state, DMatrix); + reduceDuplexRowSetupf(2, 1, 3, state,DMatrix); + + + uint rowa; + uint prev = 3; + for (uint i = 0; i<4; i++) { + rowa = state[0].x & 3; + reduceDuplexRowf(prev, rowa, i, state, DMatrix); + prev = i; + } + + + + uint shift = (memshift * 4 * rowa); + + for (int j = 0; j < 3; j++) + state[j] ^= (DMatrix)[j+shift]; + + for (int i = 0; i < 12; i++) + round_lyra(state); +////////////////////////////////////// + + + for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];} +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0,dt1,dt2,dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0]=t12[0]; + t[1]=t12[1]; + t[2]=t12[2]; + + dt0=hash->h8[0]; + dt1=hash->h8[1]; + dt2=hash->h8[2]; + dt3=hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i<19; i+=2) {Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { h[8] ^= h[i]; } + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +//cubehash + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 
0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint dh[16] = { + 0x40414243, 0x44454647, + 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, + 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, + 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, + 0x78797A7B, 0x7C7D7E7F + }; + uint final_s[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, + 0xaaaaaaa3, 0xaaaaaaa4, 0xaaaaaaa5, + 0xaaaaaaa6, 0xaaaaaaa7, 0xaaaaaaa8, + 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, + 0xaaaaaaaf + }; + + uint message[16]; + for (int i = 0; i<8; i++) message[i] = hash->h4[i]; + for (int i = 9; i<14; i++) message[i] = 0; + message[8]= 0x80; + message[14]=0x100; + message[15]=0; + + Compression256(message, dh); + Compression256(dh, final_s); + barrier(CLK_LOCAL_MEM_FENCE); + + + bool result = ( ((ulong*)final_s)[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } + +} + + +#endif // LYRA2RE_CL \ No newline at end of file diff --git a/kernel/Lyra2v2.cl b/kernel/Lyra2v2.cl new file mode 100644 index 000000000..f9f9161de --- /dev/null +++ b/kernel/Lyra2v2.cl @@ -0,0 +1,184 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define ROTL64(x,n) rotate(x,(ulong)n) +#define ROTR64(x,n) rotate(x,(ulong)(64-n)) +#define SWAP32(x) as_ulong(as_uint2(x).s10) +#define SWAP24(x) as_ulong(as_uchar8(x).s34567012) +#define SWAP16(x) as_ulong(as_uchar8(x).s23456701) + +#define G(a,b,c,d) \ + do { \ + a += b; d ^= a; d = SWAP32(d); \ + c += d; b ^= c; b = ROTR64(b,24); \ + a += b; d ^= a; d = ROTR64(d,16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + +#define G_old(a,b,c,d) \ + do { \ + a += b; d ^= a; d = ROTR64(d, 32); \ + c += d; b ^= c; b = ROTR64(b, 24); \ + a += b; d ^= a; d = ROTR64(d, 16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + + +/*One Round of the Blake2b's compression function*/ + +#define round_lyra(s) \ + do { \ + G(s[0].x, s[1].x, s[2].x, s[3].x); \ + G(s[0].y, s[1].y, s[2].y, s[3].y); \ + G(s[0].z, s[1].z, s[2].z, s[3].z); \ + G(s[0].w, s[1].w, s[2].w, s[3].w); \ + G(s[0].x, s[1].y, s[2].z, s[3].w); \ + G(s[0].y, s[1].z, s[2].w, s[3].x); \ + G(s[0].z, s[1].w, s[2].x, s[3].y); \ + G(s[0].w, s[1].x, s[2].y, s[3].z); \ + } while(0) + + + +void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix) +{ + + ulong4 state1[3]; + uint ps1 = 0; + uint ps2 = (memshift * 3 + memshift * 4); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + round_lyra(state); + for (int j = 0; j < 3; j++) state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j]; + } + +} + + + +void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix) +{ + +ulong4 state1[3], state2[3]; +uint ps1 = (memshift * 4 * rowIn); +uint ps2 = (memshift * 4 * rowInOut); +uint ps3 = (memshift * 4 * rowOut); + + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 + i*memshift; + + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + + for (int j = 0; j < 3; j++) state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + + + round_lyra(state); + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + + if (rowInOut != rowOut) { + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s3] ^= state[j]; + } + else { + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + + } + } + + + + +void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) { + + ulong4 state2[3], state1[3]; + uint ps1 = (memshift * 4 * rowIn); + uint ps2 = (memshift * 4 * rowInOut); + uint ps3 = (memshift * 3 + memshift * 4 * rowOut); + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + for (int j = 0; j < 3; j++) { + ulong4 tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + round_lyra(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix)[j + s3] = state1[j]; + } + + ((ulong*)state2)[0] ^= 
((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + } + diff --git a/kernel/blake256.cl b/kernel/blake256.cl new file mode 100644 index 000000000..012285a5e --- /dev/null +++ b/kernel/blake256.cl @@ -0,0 +1,96 @@ +/* +* blake256 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* Copyright (c) 2014 tpruvot +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +__constant static const int sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + + +__constant static const sph_u32 c_IV256[8] = { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +/* Second part (64-80) msg never change, store it */ +__constant static const sph_u32 c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; +__constant static const sph_u32 c_u256[16] = { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; + +#define GS(a,b,c,d,x) { \ + const sph_u32 idx1 = sigma[r][x]; \ + const sph_u32 idx2 = sigma[r][x+1]; \ + v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = SPH_ROTR32(v[d], 16); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = SPH_ROTR32(v[b], 12); 
\ +\ + v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = SPH_ROTR32(v[d], 8); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = SPH_ROTR32(v[b], 7); \ +} + + + + + diff --git a/kernel/bmw256.cl b/kernel/bmw256.cl new file mode 100644 index 000000000..19c85cbc9 --- /dev/null +++ b/kernel/bmw256.cl @@ -0,0 +1,162 @@ +/* +* bmw256 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2015 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) SPH_ROTL32((x), 16) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + +/* Message expansion function 1 */ +uint expand32_1(int i, uint *M32, uint *H, uint *Q) +{ + + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +/* Message expansion function 2 */ +uint expand32_2(int i, uint *M32, uint *H, uint *Q) +{ + + return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) + + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) + + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) + + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) 
% 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +void Compression256(uint *M32, uint *H) +{ + + int i; + uint XL32, XH32, Q[32]; + + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; + Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; + Q[10] = ss0(Q[10]) + H[11]; + Q[11] = ss1(Q[11]) + H[12]; + Q[12] = ss2(Q[12]) + H[13]; + Q[13] = ss3(Q[13]) + H[14]; + Q[14] = ss4(Q[14]) + H[15]; + Q[15] = ss0(Q[15]) + H[0]; + + /* This is the Message expansion or f_1 in the documentation. */ + /* It has 16 rounds. */ + /* Blue Midnight Wish has two tunable security parameters. */ + /* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ + /* The following relation for these parameters should be satisfied: */ + /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ +#pragma unroll + for (i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, M32, H, Q); + +#pragma unroll + for (i = 2; i<16; i++) + Q[i + 16] = expand32_2(i + 16, M32, H, Q); + + /* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */ + /* 16 new variables that are produced in the Message Expansion part. */ + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + + /* This part is the function f_2 in the documentation */ + + /* Compute the double chaining pipe for the next message block.
*/ + H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]); + H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]); + H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]); + H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]); + H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]); + H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]); + + H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); + H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); + H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); + H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); + H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); + H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); + H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + +} diff --git a/kernel/credits.cl b/kernel/credits.cl new file mode 100644 index 000000000..19cbea67f --- /dev/null +++ b/kernel/credits.cl @@ -0,0 +1,232 @@ +/* +* "credits" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + + +#define ROL32(x, n) rotate(x, (uint) n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) +#define SWAP64(x) as_ulong(as_uchar8(x).s32107654) // byte-swaps each 32-bit half of x; the halves keep their positions
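+/* A note on the helpers below: OpenCL's rotate() is a left-rotate, and ROL32(x, 32 - n) == rotr(x, n), so S0/S1 defined next are SHA-256's small sigma0/sigma1 (rotr 7,18 + shr 3 and rotr 17,19 + shr 10), S2/S3 are the big Sigma0/Sigma1 (rotr 2,13,22 and rotr 6,11,25), and F0/F1 are Maj/Ch written with bitselect(a,b,c) = (a & ~c) | (b & c): as called from P(), F0(a,b,c) is the bitwise majority of a,b,c, while F1(e,f,g) picks f where e is 1 and g where e is 0. The *_ref macros are only an illustrative restatement in right-rotate form and stay compiled out; the _ref names occur nowhere else in the kernel. */ +#if 0 +#define ROTR32(x, n) rotate((uint)(x), (uint)(32 - (n))) +#define S0_ref(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ ((x) >> 3)) +#define S1_ref(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ ((x) >> 10)) +#define S2_ref(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S3_ref(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#endif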
+ +#define SHR(x, n) ((x) >> n) + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ + uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 
0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res + state); +} + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + + +static __constant uint8 pad_data = +{ + 0x00000000, 0x00000000, 0x80000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000540 +}; + +static __constant uint8 pad_state = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output,const ulong target, uint8 midstate ) +{ + + + uint nonce = get_global_id(0); + uint16 in; + uint8 state1; + + in.lo = ((__global const uint8 *)input)[4]; + in.hi = pad_data; + in.hi.s0 = ((__global const uint *)input)[40]; + in.hi.s1 = ((__global const uint *)input)[41]; + in.s3 = nonce; + state1 = sha256_Transform(in, midstate); + in.lo = state1; + in.hi = pad_state; + state1 = sha256_Transform(in,H256); + +if (SWAP64(state1.s67) <= target) + output[atomic_inc(output + 0xFF)] = nonce; + +} + diff --git a/kernel/cubehash256.cl b/kernel/cubehash256.cl new file mode 100644 index 000000000..9bc4c6545 --- /dev/null +++ b/kernel/cubehash256.cl @@ -0,0 +1,132 @@ +// cubehash256 +// djm34 2015 based on ccminer cubehash512 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + + +#define LROT(x, bits) rotate( x,(uint) bits) + + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint u = a; 
a = b; b = u; } + +inline void rrounds(uint x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + + diff --git a/kernel/groestl.cl b/kernel/groestl.cl index bf93cd0ba..a81633cc3 100644 --- a/kernel/groestl.cl +++ b/kernel/groestl.cl @@ -57,8 +57,8 @@ #define USE_LE 1 #endif -#if USE_LE +#if USE_LE #if SPH_64 #define C64e(x) ((SPH_C64(x) >> 56) \ | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ @@ -1173,6 +1173,8 @@ __constant static const sph_u64 T7[] = { ^ R64(T4[B64_7(a[b7])], 24); \ } while (0) + + #else #define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ @@ -1186,6 +1188,9 @@ __constant static const sph_u64 T7[] = { ^ T7[B64_7(a[b7])]; \ } while (0) + + + #endif #if SPH_SMALL_FOOTPRINT_GROESTL @@ -1417,6 +1422,9 @@ __constant static const sph_u64 T7[] = { } while (0) */ + + + #define PERM_BIG_P(a) do { \ int r; \ for (r = 0; r < 14; ++r) { \ @@ -1429,4 +1437,125 @@ __constant static const sph_u64 T7[] = { for (r = 0; r < 
14; ++r) { \ ROUND_BIG_Q(a, r); \ } \ - } while (0) \ No newline at end of file + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif +#define ROUND_SMALL_P(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +/* + * Apparently, unrolling more than that confuses GCC, resulting in + * lower performance, even though L1 cache would be no problem. 
+ */ +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif diff --git a/kernel/groestl256.cl b/kernel/groestl256.cl new file mode 100644 index 000000000..e083e7ce3 --- /dev/null +++ b/kernel/groestl256.cl @@ -0,0 +1,1241 @@ +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl256 + * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + + +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) + +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ (~((sph_u64)(j) << 56))) + +__constant static const sph_u64 T0_G[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), 
C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + 
C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +/* +__constant static const ulong T1_G[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), + C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), 
C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + 
C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; + +__constant static const ulong T2_G[] = { +C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), +C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), +C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), +C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), +C64e(0x50606090f050f0c0), C64e(0x0302020705030504), +C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), +C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), +C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), +C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), +C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), +C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), +C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), +C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), +C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), +C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), +C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), +C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), +C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), +C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), +C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), +C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), +C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), +C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), +C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), +C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), +C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), +C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), +C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), +C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), +C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), +C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), +C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), +C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), +C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), +C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), +C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), +C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), +C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), +C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), +C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), +C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), +C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), +C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), +C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), +C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), +C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), +C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), +C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), +C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), +C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), +C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), +C64e(0x55666699ff55ffcc), 
C64e(0x941111b6a794a722), +C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), +C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), +C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), +C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), +C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), +C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), +C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), +C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), +C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), +C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), +C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), +C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), +C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), +C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), +C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), +C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), +C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), +C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), +C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), +C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), +C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), +C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), +C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), +C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), +C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), +C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), +C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), +C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), +C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), +C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), +C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), +C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), +C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), +C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), +C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), +C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), +C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), +C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), +C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), +C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), +C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), +C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), +C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), +C64e(0xe947476720e9208e), C64e(0x1810103828182820), +C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), +C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), +C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), +C64e(0xc773732152c752e6), C64e(0x51979764f351f335), +C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), +C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), +C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), +C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), +C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), +C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), +C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), +C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), +C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), +C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), +C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), +C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), +C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), +C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), +C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), +C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), +C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), +C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), +C64e(0x4987875cdb49db15), 
C64e(0xffaaaab01aff1a4f), +C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), +C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), +C64e(0x800909929b809b12), C64e(0x171a1a2339173934), +C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), +C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), +C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), +C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), +C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), +C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +__constant static const ulong T3_G[] = { +C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), +C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), +C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), +C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), +C64e(0xc050606090f050f0), C64e(0x0403020207050305), +C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), +C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), +C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), +C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), +C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), +C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), +C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), +C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), +C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), +C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), +C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), +C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), +C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), +C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), +C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), +C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), +C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), +C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), +C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), +C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), +C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), +C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), +C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), +C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), +C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), +C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), +C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), +C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), +C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), +C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), +C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), +C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), +C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), +C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), +C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), +C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), +C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), +C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), +C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), +C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), +C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), +C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), +C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), +C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), +C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), +C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), +C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), +C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), +C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), +C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), +C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), +C64e(0x5ff3a2a2ac0ef30e), 
C64e(0xbafe5d5d4419fe19), +C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), +C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), +C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), +C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), +C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), +C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), +C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), +C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), +C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), +C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), +C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), +C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), +C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), +C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), +C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), +C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), +C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), +C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), +C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), +C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), +C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), +C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), +C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), +C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), +C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), +C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), +C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), +C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), +C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), +C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), +C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), +C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), +C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), +C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), +C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), +C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), +C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), +C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), +C64e(0x8ee947476720e920), C64e(0x2018101038281828), +C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), +C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), +C64e(0x70243838546c246c), C64e(0xaef157575f08f108), +C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), +C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), +C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), +C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), +C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), +C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), +C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), +C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), +C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), +C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), +C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), +C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), +C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), +C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), +C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), +C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), +C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), +C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), +C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), +C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), +C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), +C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), +C64e(0x12800909929b809b), C64e(0x34171a1a23391739), +C64e(0xcada65651075da75), C64e(0xb531d7d784533153), +C64e(0x13c68484d551c651), 
C64e(0xbbb8d0d003d3b8d3), +C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), +C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), +C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), +C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; +*/ +__constant static const ulong T4_G[] = { +C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), +C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), +C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), +C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), +C64e(0xf0c050606090f050), C64e(0x0504030202070503), +C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), +C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), +C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), +C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), +C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), +C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), +C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), +C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), +C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), +C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), +C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), +C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), +C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), +C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), +C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), +C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), +C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), +C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), +C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), +C64e(0x14100c08081c140c), C64e(0xf63152959563f652), +C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), +C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), +C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), +C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), +C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), +C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), +C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), +C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), +C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), +C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), +C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), +C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), +C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), +C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), +C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), +C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), +C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), +C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), +C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), +C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), +C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), +C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), +C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), +C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), +C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), +C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), +C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), +C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), +C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), +C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), +C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), +C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), +C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), +C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), +C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), +C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), +C64e(0x9f4575afaf309f75), 
C64e(0xa584634242e7a563), +C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), +C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), +C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), +C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), +C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), +C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), +C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), +C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), +C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), +C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), +C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), +C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), +C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), +C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), +C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), +C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), +C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), +C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), +C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), +C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), +C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), +C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), +C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), +C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), +C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), +C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), +C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), +C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), +C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), +C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), +C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), +C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), +C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), +C64e(0x208ee947476720e9), C64e(0x2820181010382818), +C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), +C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), +C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), +C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), +C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), +C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), +C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), +C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), +C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), +C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), +C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), +C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), +C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), +C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), +C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), +C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), +C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), +C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), +C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), +C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), +C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), +C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), +C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), +C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), +C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), +C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), +C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), +C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), +C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), +C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), +C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), +C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +/* +__constant static 
const ulong T5_G[] = { +C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), +C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), +C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), +C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), +C64e(0x50f0c050606090f0), C64e(0x0305040302020705), +C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), +C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), +C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), +C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), +C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), +C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), +C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), +C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), +C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), +C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), +C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), +C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), +C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), +C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), +C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), +C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), +C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), +C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), +C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), +C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), +C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), +C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), +C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), +C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), +C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), +C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), +C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), +C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), +C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), +C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), +C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), +C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), +C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), +C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), +C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), +C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), +C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), +C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), +C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), +C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), +C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), +C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), +C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), +C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), +C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), +C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), +C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), +C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), +C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), +C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), +C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), +C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), +C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), +C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), +C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), +C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), +C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), +C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), +C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), +C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), +C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), +C64e(0xe13867e1bebe8638), 
C64e(0xa2fd6aa23535c8fd), +C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), +C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), +C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), +C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), +C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), +C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), +C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), +C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), +C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), +C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), +C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), +C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), +C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), +C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), +C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), +C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), +C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), +C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), +C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), +C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), +C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), +C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), +C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), +C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), +C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), +C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), +C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), +C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), +C64e(0xe9208ee947476720), C64e(0x1828201810103828), +C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), +C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), +C64e(0x246c70243838546c), C64e(0xf108aef157575f08), +C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), +C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), +C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), +C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), +C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), +C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), +C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), +C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), +C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), +C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), +C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), +C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), +C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), +C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), +C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), +C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), +C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), +C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), +C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), +C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), +C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), +C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), +C64e(0x809b12800909929b), C64e(0x173934171a1a2339), +C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), +C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), +C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), +C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), +C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), +C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +__constant static const ulong T6_G[] = { +C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), +C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), +C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), +C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), +C64e(0xf050f0c050606090), 
C64e(0x0503050403020207), +C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), +C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), +C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), +C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), +C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), +C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), +C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), +C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), +C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), +C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), +C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), +C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), +C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), +C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), +C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), +C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), +C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), +C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), +C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), +C64e(0x140c14100c08081c), C64e(0xf652f63152959563), +C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), +C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), +C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), +C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), +C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), +C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), +C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), +C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), +C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), +C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), +C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), +C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), +C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), +C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), +C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), +C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), +C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), +C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), +C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), +C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), +C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), +C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), +C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), +C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), +C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), +C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), +C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), +C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), +C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), +C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), +C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), +C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), +C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), +C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), +C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), +C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), +C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), +C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), +C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), +C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), +C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), +C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), +C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), +C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), +C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), +C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), +C64e(0x7d2b7d642b32324f), 
C64e(0xa495a4d795e6e642), +C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), +C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), +C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), +C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), +C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), +C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), +C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), +C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), +C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), +C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), +C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), +C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), +C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), +C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), +C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), +C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), +C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), +C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), +C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), +C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), +C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), +C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), +C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), +C64e(0x20e9208ee9474767), C64e(0x2818282018101038), +C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), +C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), +C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), +C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), +C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), +C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), +C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), +C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), +C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), +C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), +C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), +C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), +C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), +C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), +C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), +C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), +C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), +C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), +C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), +C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), +C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), +C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), +C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), +C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), +C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), +C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), +C64e(0x75da75cada656510), C64e(0x533153b531d7d784), +C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), +C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), +C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), +C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), +C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +__constant static const ulong T7_G[] = { +C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), +C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), +C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), +C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), +C64e(0x90f050f0c0506060), C64e(0x0705030504030202), +C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), +C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), +C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), +C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), +C64e(0x49c040c009408989), 
C64e(0x68928792ef87fafa), +C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), +C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), +C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), +C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), +C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), +C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), +C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), +C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), +C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), +C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), +C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), +C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), +C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), +C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), +C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), +C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), +C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), +C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), +C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), +C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), +C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), +C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), +C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), +C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), +C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), +C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), +C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), +C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), +C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), +C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), +C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), +C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), +C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), +C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), +C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), +C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), +C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), +C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), +C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), +C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), +C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), +C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), +C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), +C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), +C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), +C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), +C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), +C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), +C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), +C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), +C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), +C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), +C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), +C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), +C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), +C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), +C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), +C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), +C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), +C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), +C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), +C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), +C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), +C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), +C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), +C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), +C64e(0xc945ca4503ca8c8c), 
C64e(0xbc7b297b9529c7c7), +C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), +C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), +C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), +C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), +C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), +C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), +C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), +C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), +C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), +C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), +C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), +C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), +C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), +C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), +C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), +C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), +C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), +C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), +C64e(0x6720e9208ee94747), C64e(0x3828182820181010), +C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), +C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), +C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), +C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), +C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), +C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), +C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), +C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), +C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), +C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), +C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), +C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), +C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), +C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), +C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), +C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), +C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), +C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), +C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), +C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), +C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), +C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), +C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), +C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), +C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), +C64e(0x929b809b12800909), C64e(0x2339173934171a1a), +C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), +C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), +C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), +C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), +C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), +C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; + +*/ + + + + + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0_G[B64_0(a[b0])] \ + ^ R64(T0_G[B64_1(a[b1])], 8) \ + ^ R64(T0_G[B64_2(a[b2])], 16) \ + ^ R64(T0_G[B64_3(a[b3])], 24) \ + ^ T4_G[B64_4(a[b4])] \ + ^ R64(T4_G[B64_5(a[b5])], 8) \ + ^ R64(T4_G[B64_6(a[b6])], 16) \ + ^ R64(T4_G[B64_7(a[b7])], 24); \ + } while (0) +/* +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0_G[B64_0(a[b0])] \ + ^ as_ulong(as_uchar8(T0_G[B64_1(a[b1])]).s70123456) \ + ^ as_ulong(as_uchar8(T0_G[B64_2(a[b2])]).s67012345) \ + ^ as_ulong(as_uchar8(T0_G[B64_3(a[b3])]).s56701234) \ + ^ T4_G[B64_4(a[b4])] \ + ^ as_ulong(as_uchar8(T4_G[B64_5(a[b5])]).s70123456) \ + ^ as_ulong(as_uchar8(T4_G[B64_6(a[b6])]).s67012345) \ + ^ as_ulong(as_uchar8(T4_G[B64_7(a[b7])]).s56701234); \ + } while (0) +*/ + 
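+/*
+ * The two-table RSTT above exploits the Groestl T-table symmetry: each of
+ * T1..T3 is T0 rotated left by one, two or three bytes, and T5..T7 relate
+ * to T4 the same way, so only T0_G and T4_G need to stay resident. A
+ * minimal host-side sanity check of that identity (assuming R64(x, n) is a
+ * 64-bit left rotation; check_rot_identity is a hypothetical helper, not
+ * used by the kernel) could look like:
+ *
+ *   static int check_rot_identity(const unsigned long long T0[256],
+ *                                 const unsigned long long T1[256])
+ *   {
+ *     for (int i = 0; i < 256; i++)                 // all byte values
+ *       if (T1[i] != ((T0[i] << 8) | (T0[i] >> 56)))
+ *         return 0;                                 // not a 1-byte rotation
+ *     return 1;
+ *   }
+ *
+ * Keeping two tables instead of eight drops 12 KB of lookup data at the
+ * cost of six extra rotates per RSTT evaluation.
+ */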
+ +/* +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ T2[B64_2(a[b2])] \ + ^ R64(T2[B64_3(a[b3])], 8) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ T6[B64_6(a[b6])] \ + ^ R64(T6[B64_7(a[b7])], 8); \ + } while (0) + + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) +*/ +#define ROUND_SMALL_P(a, r) do { \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Pf(a,r) do { \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define PERM_SMALL_P(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Pf(a) do { \ + for (int r = 0; r < 9; r ++) { \ + ROUND_SMALL_P(a, r);} \ + ROUND_SMALL_Pf(a,9); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + diff --git a/kernel/groestlcoin-v1.cl b/kernel/groestlcoin-v1.cl new file mode 100644 index 000000000..c8382d581 --- /dev/null +++ b/kernel/groestlcoin-v1.cl @@ -0,0 +1,1854 @@ +/* + * ==========================(LICENSE BEGIN)============================ + * + * GroestlCoin kernel implementation: Copyright (c) 2014 pallas + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * GroestlCoin kernel implementation: @author pallas + * Forum thread: http://bitcointalk.org/index.php?topic=779598 + * Donations to: BTC 1H7qC5uHuGX2d5s9Kuw3k7Wm7xMQzL16SN + */ + +#ifndef GROESTLCOIN_CL +#define GROESTLCOIN_CL + +#define DC64(x) ((ulong)(x ## UL)) +#define DEC64E(x) (*(const __global ulong *) (x)); +#define H15 (((ulong)(512 & 0xFF) << 56) | ((ulong)(512 & 0xFF00) << 40)) +#define M15 0x100000000000000 +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) // rotate + +#define C64e(x) ((DC64(x) >> 56) \ + | ((DC64(x) >> 40) & DC64(0x000000000000FF00)) \ + | ((DC64(x) >> 24) & DC64(0x0000000000FF0000)) \ + | ((DC64(x) >> 8) & DC64(0x00000000FF000000)) \ + | ((DC64(x) << 8) & DC64(0x000000FF00000000)) \ + | ((DC64(x) << 24) & DC64(0x0000FF0000000000)) \ + | ((DC64(x) << 40) & DC64(0x00FF000000000000)) \ + | ((DC64(x) << 56))) +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define PC64(j, r) ((ulong)((j) + (r))) +#define QC64(j, r) (((ulong)(r) << 56) ^ ~((ulong)(j) << 56)) + +__constant static const ulong T0[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), 
C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + 
C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +__constant static const ulong T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), + C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + 
C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), 
C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; +/* +__constant static const ulong T2_G[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + 
C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), 
C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +__constant static const ulong T3_G[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + 
C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), 
C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +__constant static const ulong T4_G[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + 
C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), 
C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +__constant static const ulong T5_G[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), + C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + 
C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), 
C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +__constant static const ulong T6_G[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + 
C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), 
C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +__constant static const ulong T7_G[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + 
C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), 
C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; +*/ +#define RBTT(d, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + d = T0[t0[b0]] \ + ^ T1[t1[b1]] \ + ^ T2[t2[b2]] \ + ^ T3[t3[b3]] \ + ^ T4[t4[b4]] \ + ^ T5[t5[b5]] \ + ^ T6[t6[b6]] \ + ^ T7[t7[b7]]; \ + } while (0) + +#define ROUND_BIG_P(a, r) do { \ + t0[0x0] = B64_0(a[0x0]) ^ PC64(0x00, r); \ + t1[0x0] = B64_1(a[0x0]); \ + t2[0x0] = B64_2(a[0x0]); \ + t3[0x0] = B64_3(a[0x0]); \ + t4[0x0] = B64_4(a[0x0]); \ + t5[0x0] = B64_5(a[0x0]); \ + t6[0x0] = B64_6(a[0x0]); \ + t7[0x0] = B64_7(a[0x0]); \ + t0[0x1] = B64_0(a[0x1]) ^ PC64(0x10, r); \ + t1[0x1] = B64_1(a[0x1]); \ + t2[0x1] = B64_2(a[0x1]); \ + t3[0x1] = B64_3(a[0x1]); \ + t4[0x1] = B64_4(a[0x1]); \ + t5[0x1] = B64_5(a[0x1]); \ + t6[0x1] = B64_6(a[0x1]); \ + t7[0x1] = B64_7(a[0x1]); \ + t0[0x2] = B64_0(a[0x2]) ^ PC64(0x20, r); \ + t1[0x2] = B64_1(a[0x2]); \ + t2[0x2] = B64_2(a[0x2]); \ + t3[0x2] = B64_3(a[0x2]); \ + t4[0x2] = B64_4(a[0x2]); \ + t5[0x2] = B64_5(a[0x2]); \ + t6[0x2] = B64_6(a[0x2]); \ + t7[0x2] = B64_7(a[0x2]); \ + t0[0x3] = B64_0(a[0x3]) ^ PC64(0x30, r); \ + t1[0x3] = B64_1(a[0x3]); \ + t2[0x3] = B64_2(a[0x3]); \ + t3[0x3] = B64_3(a[0x3]); \ + t4[0x3] = B64_4(a[0x3]); \ + t5[0x3] = B64_5(a[0x3]); \ + t6[0x3] = B64_6(a[0x3]); \ + t7[0x3] = B64_7(a[0x3]); \ + t0[0x4] = B64_0(a[0x4]) ^ PC64(0x40, r); \ + t1[0x4] = B64_1(a[0x4]); \ + t2[0x4] = B64_2(a[0x4]); \ + t3[0x4] = B64_3(a[0x4]); \ + t4[0x4] = B64_4(a[0x4]); \ + t5[0x4] = B64_5(a[0x4]); \ + t6[0x4] = B64_6(a[0x4]); \ + t7[0x4] = B64_7(a[0x4]); \ + t0[0x5] = B64_0(a[0x5]) ^ PC64(0x50, r); \ + t1[0x5] = B64_1(a[0x5]); \ + t2[0x5] = B64_2(a[0x5]); \ + t3[0x5] = B64_3(a[0x5]); \ + t4[0x5] = B64_4(a[0x5]); \ + t5[0x5] = B64_5(a[0x5]); \ + t6[0x5] = B64_6(a[0x5]); \ + t7[0x5] = B64_7(a[0x5]); \ + t0[0x6] = B64_0(a[0x6]) ^ PC64(0x60, r); \ + t1[0x6] = B64_1(a[0x6]); \ + t2[0x6] = B64_2(a[0x6]); \ + t3[0x6] = B64_3(a[0x6]); \ + t4[0x6] = B64_4(a[0x6]); \ + t5[0x6] = B64_5(a[0x6]); \ + t6[0x6] = B64_6(a[0x6]); \ + t7[0x6] = B64_7(a[0x6]); \ + t0[0x7] = B64_0(a[0x7]) ^ PC64(0x70, r); \ + t1[0x7] = B64_1(a[0x7]); \ + t2[0x7] = B64_2(a[0x7]); \ + t3[0x7] = B64_3(a[0x7]); \ + t4[0x7] = B64_4(a[0x7]); \ + t5[0x7] = B64_5(a[0x7]); \ + t6[0x7] = B64_6(a[0x7]); \ + t7[0x7] = B64_7(a[0x7]); \ + t0[0x8] = B64_0(a[0x8]) ^ PC64(0x80, r); \ + t1[0x8] = B64_1(a[0x8]); \ + t2[0x8] = B64_2(a[0x8]); \ + t3[0x8] = B64_3(a[0x8]); \ + t4[0x8] = B64_4(a[0x8]); \ + t5[0x8] = B64_5(a[0x8]); \ + t6[0x8] = B64_6(a[0x8]); \ + t7[0x8] = B64_7(a[0x8]); \ + t0[0x9] = B64_0(a[0x9]) ^ PC64(0x90, r); \ + t1[0x9] = B64_1(a[0x9]); 
\ + t2[0x9] = B64_2(a[0x9]); \ + t3[0x9] = B64_3(a[0x9]); \ + t4[0x9] = B64_4(a[0x9]); \ + t5[0x9] = B64_5(a[0x9]); \ + t6[0x9] = B64_6(a[0x9]); \ + t7[0x9] = B64_7(a[0x9]); \ + t0[0xA] = B64_0(a[0xA]) ^ PC64(0xA0, r); \ + t1[0xA] = B64_1(a[0xA]); \ + t2[0xA] = B64_2(a[0xA]); \ + t3[0xA] = B64_3(a[0xA]); \ + t4[0xA] = B64_4(a[0xA]); \ + t5[0xA] = B64_5(a[0xA]); \ + t6[0xA] = B64_6(a[0xA]); \ + t7[0xA] = B64_7(a[0xA]); \ + t0[0xB] = B64_0(a[0xB]) ^ PC64(0xB0, r); \ + t1[0xB] = B64_1(a[0xB]); \ + t2[0xB] = B64_2(a[0xB]); \ + t3[0xB] = B64_3(a[0xB]); \ + t4[0xB] = B64_4(a[0xB]); \ + t5[0xB] = B64_5(a[0xB]); \ + t6[0xB] = B64_6(a[0xB]); \ + t7[0xB] = B64_7(a[0xB]); \ + t0[0xC] = B64_0(a[0xC]) ^ PC64(0xC0, r); \ + t1[0xC] = B64_1(a[0xC]); \ + t2[0xC] = B64_2(a[0xC]); \ + t3[0xC] = B64_3(a[0xC]); \ + t4[0xC] = B64_4(a[0xC]); \ + t5[0xC] = B64_5(a[0xC]); \ + t6[0xC] = B64_6(a[0xC]); \ + t7[0xC] = B64_7(a[0xC]); \ + t0[0xD] = B64_0(a[0xD]) ^ PC64(0xD0, r); \ + t1[0xD] = B64_1(a[0xD]); \ + t2[0xD] = B64_2(a[0xD]); \ + t3[0xD] = B64_3(a[0xD]); \ + t4[0xD] = B64_4(a[0xD]); \ + t5[0xD] = B64_5(a[0xD]); \ + t6[0xD] = B64_6(a[0xD]); \ + t7[0xD] = B64_7(a[0xD]); \ + t0[0xE] = B64_0(a[0xE]) ^ PC64(0xE0, r); \ + t1[0xE] = B64_1(a[0xE]); \ + t2[0xE] = B64_2(a[0xE]); \ + t3[0xE] = B64_3(a[0xE]); \ + t4[0xE] = B64_4(a[0xE]); \ + t5[0xE] = B64_5(a[0xE]); \ + t6[0xE] = B64_6(a[0xE]); \ + t7[0xE] = B64_7(a[0xE]); \ + t0[0xF] = B64_0(a[0xF]) ^ PC64(0xF0, r); \ + t1[0xF] = B64_1(a[0xF]); \ + t2[0xF] = B64_2(a[0xF]); \ + t3[0xF] = B64_3(a[0xF]); \ + t4[0xF] = B64_4(a[0xF]); \ + t5[0xF] = B64_5(a[0xF]); \ + t6[0xF] = B64_6(a[0xF]); \ + t7[0xF] = B64_7(a[0xF]); \ + RBTT(a[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(a[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(a[0x2], 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(a[0x3], 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(a[0x4], 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(a[0x5], 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(a[0x6], 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(a[0x7], 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(a[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(a[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(a[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(a[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(a[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(a[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(a[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(a[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + t0[0x0] = B64_0(a[0x0]); \ + t1[0x0] = B64_1(a[0x0]); \ + t2[0x0] = B64_2(a[0x0]); \ + t3[0x0] = B64_3(a[0x0]); \ + t4[0x0] = B64_4(a[0x0]); \ + t5[0x0] = B64_5(a[0x0]); \ + t6[0x0] = B64_6(a[0x0]); \ + t7[0x0] = B64_7(a[0x0]); \ + t0[0x1] = B64_0(a[0x1]); \ + t1[0x1] = B64_1(a[0x1]); \ + t2[0x1] = B64_2(a[0x1]); \ + t3[0x1] = B64_3(a[0x1]); \ + t4[0x1] = B64_4(a[0x1]); \ + t5[0x1] = B64_5(a[0x1]); \ + t6[0x1] = B64_6(a[0x1]); \ + t7[0x1] = 
B64_7(a[0x1]); \ + t0[0x2] = B64_0(a[0x2]); \ + t1[0x2] = B64_1(a[0x2]); \ + t2[0x2] = B64_2(a[0x2]); \ + t3[0x2] = B64_3(a[0x2]); \ + t4[0x2] = B64_4(a[0x2]); \ + t5[0x2] = B64_5(a[0x2]); \ + t6[0x2] = B64_6(a[0x2]); \ + t7[0x2] = B64_7(a[0x2]); \ + t0[0x3] = B64_0(a[0x3]); \ + t1[0x3] = B64_1(a[0x3]); \ + t2[0x3] = B64_2(a[0x3]); \ + t3[0x3] = B64_3(a[0x3]); \ + t4[0x3] = B64_4(a[0x3]); \ + t5[0x3] = B64_5(a[0x3]); \ + t6[0x3] = B64_6(a[0x3]); \ + t7[0x3] = B64_7(a[0x3]); \ + t0[0x4] = B64_0(a[0x4]); \ + t1[0x4] = B64_1(a[0x4]); \ + t2[0x4] = B64_2(a[0x4]); \ + t3[0x4] = B64_3(a[0x4]); \ + t4[0x4] = B64_4(a[0x4]); \ + t5[0x4] = B64_5(a[0x4]); \ + t6[0x4] = B64_6(a[0x4]); \ + t7[0x4] = B64_7(a[0x4]); \ + t0[0x5] = B64_0(a[0x5]); \ + t1[0x5] = B64_1(a[0x5]); \ + t2[0x5] = B64_2(a[0x5]); \ + t3[0x5] = B64_3(a[0x5]); \ + t4[0x5] = B64_4(a[0x5]); \ + t5[0x5] = B64_5(a[0x5]); \ + t6[0x5] = B64_6(a[0x5]); \ + t7[0x5] = B64_7(a[0x5]); \ + t0[0x6] = B64_0(a[0x6]); \ + t1[0x6] = B64_1(a[0x6]); \ + t2[0x6] = B64_2(a[0x6]); \ + t3[0x6] = B64_3(a[0x6]); \ + t4[0x6] = B64_4(a[0x6]); \ + t5[0x6] = B64_5(a[0x6]); \ + t6[0x6] = B64_6(a[0x6]); \ + t7[0x6] = B64_7(a[0x6]); \ + t0[0x7] = B64_0(a[0x7]); \ + t1[0x7] = B64_1(a[0x7]); \ + t2[0x7] = B64_2(a[0x7]); \ + t3[0x7] = B64_3(a[0x7]); \ + t4[0x7] = B64_4(a[0x7]); \ + t5[0x7] = B64_5(a[0x7]); \ + t6[0x7] = B64_6(a[0x7]); \ + t7[0x7] = B64_7(a[0x7]); \ + t0[0x8] = B64_0(a[0x8]); \ + t1[0x8] = B64_1(a[0x8]); \ + t2[0x8] = B64_2(a[0x8]); \ + t3[0x8] = B64_3(a[0x8]); \ + t4[0x8] = B64_4(a[0x8]); \ + t5[0x8] = B64_5(a[0x8]); \ + t6[0x8] = B64_6(a[0x8]); \ + t7[0x8] = B64_7(a[0x8]); \ + t0[0x9] = B64_0(a[0x9]); \ + t1[0x9] = B64_1(a[0x9]); \ + t2[0x9] = B64_2(a[0x9]); \ + t3[0x9] = B64_3(a[0x9]); \ + t4[0x9] = B64_4(a[0x9]); \ + t5[0x9] = B64_5(a[0x9]); \ + t6[0x9] = B64_6(a[0x9]); \ + t7[0x9] = B64_7(a[0x9]); \ + t0[0xA] = B64_0(a[0xA]); \ + t1[0xA] = B64_1(a[0xA]); \ + t2[0xA] = B64_2(a[0xA]); \ + t3[0xA] = B64_3(a[0xA]); \ + t4[0xA] = B64_4(a[0xA]); \ + t5[0xA] = B64_5(a[0xA]); \ + t6[0xA] = B64_6(a[0xA]); \ + t7[0xA] = B64_7(a[0xA]); \ + t0[0xB] = B64_0(a[0xB]); \ + t1[0xB] = B64_1(a[0xB]); \ + t2[0xB] = B64_2(a[0xB]); \ + t3[0xB] = B64_3(a[0xB]); \ + t4[0xB] = B64_4(a[0xB]); \ + t5[0xB] = B64_5(a[0xB]); \ + t6[0xB] = B64_6(a[0xB]); \ + t7[0xB] = B64_7(a[0xB]); \ + t0[0xC] = B64_0(a[0xC]); \ + t1[0xC] = B64_1(a[0xC]); \ + t2[0xC] = B64_2(a[0xC]); \ + t3[0xC] = B64_3(a[0xC]); \ + t4[0xC] = B64_4(a[0xC]); \ + t5[0xC] = B64_5(a[0xC]); \ + t6[0xC] = B64_6(a[0xC]); \ + t7[0xC] = B64_7(a[0xC]); \ + t0[0xD] = B64_0(a[0xD]); \ + t1[0xD] = B64_1(a[0xD]); \ + t2[0xD] = B64_2(a[0xD]); \ + t3[0xD] = B64_3(a[0xD]); \ + t4[0xD] = B64_4(a[0xD]); \ + t5[0xD] = B64_5(a[0xD]); \ + t6[0xD] = B64_6(a[0xD]); \ + t7[0xD] = B64_7(a[0xD]); \ + t0[0xE] = B64_0(a[0xE]); \ + t1[0xE] = B64_1(a[0xE]); \ + t2[0xE] = B64_2(a[0xE]); \ + t3[0xE] = B64_3(a[0xE]); \ + t4[0xE] = B64_4(a[0xE]); \ + t5[0xE] = B64_5(a[0xE]); \ + t6[0xE] = B64_6(a[0xE]); \ + t7[0xE] = B64_7(a[0xE]); \ + t0[0xF] = B64_0(a[0xF]); \ + t1[0xF] = B64_1(a[0xF]); \ + t2[0xF] = B64_2(a[0xF]); \ + t3[0xF] = B64_3(a[0xF]); \ + t4[0xF] = B64_4(a[0xF]); \ + t5[0xF] = B64_5(a[0xF]); \ + t6[0xF] = B64_6(a[0xF]); \ + t7[0xF] = B64_7(a[0xF]); \ + RBTT(a[0x0], 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(a[0x1], 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(a[0x2], 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(a[0x3], 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(a[0x4], 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ 
+ RBTT(a[0x5], 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(a[0x6], 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(a[0x7], 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(a[0x8], 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(a[0x9], 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(a[0xA], 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ + RBTT(a[0xB], 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(a[0xC], 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(a[0xD], 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(a[0xE], 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(a[0xF], 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + } while (0) + +#define PERM_BIG_P(a, start, end) do { \ + for (u = start; u < end; u++) { \ + ROUND_BIG_P(a, u); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + /* for (ulong u = 0; u < (14UL << 56); u += (1UL << 56)) { */ \ + for (u = 0; u < 14; u++) { \ + ROUND_BIG_Q(a, u); \ + } \ + } while (0) + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { + __local ulong T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; + uint u; + + // for (u = get_local_id(0); u < 256; u += get_local_size(0)) { + u = get_local_id(0); + /* + T1[u] = T1_G[u]; + T2[u] = T2_G[u]; + T3[u] = T3_G[u]; + T4[u] = T4_G[u]; + T5[u] = T5_G[u]; + T6[u] = T6_G[u]; + T7[u] = T7_G[u]; + */ + // create other tables based on T0: avoids keeping them in the kernel. +// T1[u] = ROTL64(T0[u], 8UL); + T2[u] = ROTL64(T0[u], 16UL); + T3[u] = ROTL64(T0[u], 24UL); + T4[u] = ROTL64(T0[u], 32UL); + T5[u] = ROTL64(T0[u], 40UL); + T6[u] = ROTL64(T0[u], 48UL); + T7[u] = ROTL64(T0[u], 56UL); + barrier(CLK_LOCAL_MEM_FENCE); + + ulong g[16], m[16], t0[16], t1[16], t2[16], t3[16], t4[16], t5[16], t6[16], t7[16]; + uint flag = 0, gid = get_global_id(0), r = 13; + + m[0] = DEC64E(block + 0 * 8); + m[1] = DEC64E(block + 1 * 8); + m[2] = DEC64E(block + 2 * 8); + m[3] = DEC64E(block + 3 * 8); + m[4] = DEC64E(block + 4 * 8); + m[5] = DEC64E(block + 5 * 8); + m[6] = DEC64E(block + 6 * 8); + m[7] = DEC64E(block + 7 * 8); + m[8] = DEC64E(block + 8 * 8); + m[9] = DEC64E(block + 9 * 8); + m[9] &= 0x00000000FFFFFFFF; + m[9] |= ((ulong) gid << 32); + m[10] = 0x80; + +perm: + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = M15; + +#pragma unroll + for (u = 0; u < 15; u++) g[u] = m[u]; + g[15] = M15 ^ H15; + + + g[0x0] ^= PC64(0x00, 0); + g[0x1] ^= PC64(0x10, 0); + g[0x2] ^= PC64(0x20, 0); + g[0x3] ^= PC64(0x30, 0); + g[0x4] ^= PC64(0x40, 0); + g[0x5] ^= PC64(0x50, 0); + g[0x6] ^= PC64(0x60, 0); + g[0x7] ^= PC64(0x70, 0); + g[0x8] ^= PC64(0x80, 0); + g[0x9] ^= PC64(0x90, 0); + g[0xA] ^= PC64(0xA0, 0); + g[0xB] = PC64(0xB0, 0); + g[0xC] = PC64(0xC0, 0); + g[0xD] = PC64(0xD0, 0); + g[0xE] = PC64(0xE0, 0); + g[0xF] ^= PC64(0xF0, 0); + t0[0x0] = B64_0(g[0x0]); + t1[0x0] = B64_1(g[0x0]); + t2[0x0] = B64_2(g[0x0]); + t3[0x0] = B64_3(g[0x0]); + t4[0x0] = B64_4(g[0x0]); + t5[0x0] = B64_5(g[0x0]); + t6[0x0] = B64_6(g[0x0]); + t7[0x0] = B64_7(g[0x0]); + t0[0x1] = B64_0(g[0x1]); + t1[0x1] = B64_1(g[0x1]); + t2[0x1] = B64_2(g[0x1]); + t3[0x1] = B64_3(g[0x1]); + t4[0x1] = B64_4(g[0x1]); + t5[0x1] = B64_5(g[0x1]); + t6[0x1] = B64_6(g[0x1]); + t7[0x1] = B64_7(g[0x1]); + t0[0x2] = B64_0(g[0x2]); + t1[0x2] = B64_1(g[0x2]); + t2[0x2] = B64_2(g[0x2]); + t3[0x2] = B64_3(g[0x2]); + t4[0x2] = B64_4(g[0x2]); + t5[0x2] = B64_5(g[0x2]); + t6[0x2] = B64_6(g[0x2]); + t7[0x2] = B64_7(g[0x2]); + t0[0x3] = B64_0(g[0x3]); + 
t1[0x3] = B64_1(g[0x3]); + t2[0x3] = B64_2(g[0x3]); + t3[0x3] = B64_3(g[0x3]); + t4[0x3] = B64_4(g[0x3]); + t5[0x3] = B64_5(g[0x3]); + t6[0x3] = B64_6(g[0x3]); + t7[0x3] = B64_7(g[0x3]); + t0[0x4] = B64_0(g[0x4]); + t1[0x4] = B64_1(g[0x4]); + t2[0x4] = B64_2(g[0x4]); + t3[0x4] = B64_3(g[0x4]); + t4[0x4] = B64_4(g[0x4]); + t5[0x4] = B64_5(g[0x4]); + t6[0x4] = B64_6(g[0x4]); + t7[0x4] = B64_7(g[0x4]); + t0[0x5] = B64_0(g[0x5]); + t1[0x5] = B64_1(g[0x5]); + t2[0x5] = B64_2(g[0x5]); + t3[0x5] = B64_3(g[0x5]); + t4[0x5] = B64_4(g[0x5]); + t5[0x5] = B64_5(g[0x5]); + t6[0x5] = B64_6(g[0x5]); + t7[0x5] = B64_7(g[0x5]); + t0[0x6] = B64_0(g[0x6]); + t1[0x6] = B64_1(g[0x6]); + t2[0x6] = B64_2(g[0x6]); + t3[0x6] = B64_3(g[0x6]); + t4[0x6] = B64_4(g[0x6]); + t5[0x6] = B64_5(g[0x6]); + t6[0x6] = B64_6(g[0x6]); + t7[0x6] = B64_7(g[0x6]); + t0[0x7] = B64_0(g[0x7]); + t1[0x7] = B64_1(g[0x7]); + t2[0x7] = B64_2(g[0x7]); + t3[0x7] = B64_3(g[0x7]); + t4[0x7] = B64_4(g[0x7]); + t5[0x7] = B64_5(g[0x7]); + t6[0x7] = B64_6(g[0x7]); + t7[0x7] = B64_7(g[0x7]); + t0[0x8] = B64_0(g[0x8]); + t1[0x8] = B64_1(g[0x8]); + t2[0x8] = B64_2(g[0x8]); + t3[0x8] = B64_3(g[0x8]); + t4[0x8] = B64_4(g[0x8]); + t5[0x8] = B64_5(g[0x8]); + t6[0x8] = B64_6(g[0x8]); + t7[0x8] = B64_7(g[0x8]); + t0[0x9] = B64_0(g[0x9]); + t1[0x9] = B64_1(g[0x9]); + t2[0x9] = B64_2(g[0x9]); + t3[0x9] = B64_3(g[0x9]); + t4[0x9] = B64_4(g[0x9]); + t5[0x9] = B64_5(g[0x9]); + t6[0x9] = B64_6(g[0x9]); + t7[0x9] = B64_7(g[0x9]); + t0[0xA] = B64_0(g[0xA]); + t1[0xA] = B64_1(g[0xA]); + t2[0xA] = B64_2(g[0xA]); + t3[0xA] = B64_3(g[0xA]); + t4[0xA] = B64_4(g[0xA]); + t5[0xA] = B64_5(g[0xA]); + t6[0xA] = B64_6(g[0xA]); + t7[0xA] = B64_7(g[0xA]); + t0[0xB] = B64_0(g[0xB]); + t1[0xB] = B64_1(g[0xB]); + t2[0xB] = B64_2(g[0xB]); + t3[0xB] = B64_3(g[0xB]); + t4[0xB] = B64_4(g[0xB]); + t5[0xB] = B64_5(g[0xB]); + t6[0xB] = B64_6(g[0xB]); + t7[0xB] = B64_7(g[0xB]); + t0[0xC] = B64_0(g[0xC]); + t1[0xC] = B64_1(g[0xC]); + t2[0xC] = B64_2(g[0xC]); + t3[0xC] = B64_3(g[0xC]); + t4[0xC] = B64_4(g[0xC]); + t5[0xC] = B64_5(g[0xC]); + t6[0xC] = B64_6(g[0xC]); + t7[0xC] = B64_7(g[0xC]); + t0[0xD] = B64_0(g[0xD]); + t1[0xD] = B64_1(g[0xD]); + t2[0xD] = B64_2(g[0xD]); + t3[0xD] = B64_3(g[0xD]); + t4[0xD] = B64_4(g[0xD]); + t5[0xD] = B64_5(g[0xD]); + t6[0xD] = B64_6(g[0xD]); + t7[0xD] = B64_7(g[0xD]); + t0[0xE] = B64_0(g[0xE]); + t1[0xE] = B64_1(g[0xE]); + t2[0xE] = B64_2(g[0xE]); + t3[0xE] = B64_3(g[0xE]); + t4[0xE] = B64_4(g[0xE]); + t5[0xE] = B64_5(g[0xE]); + t6[0xE] = B64_6(g[0xE]); + t7[0xE] = B64_7(g[0xE]); + t0[0xF] = B64_0(g[0xF]); + t1[0xF] = B64_1(g[0xF]); + t2[0xF] = B64_2(g[0xF]); + t3[0xF] = B64_3(g[0xF]); + t4[0xF] = B64_4(g[0xF]); + t5[0xF] = B64_5(g[0xF]); + t6[0xF] = B64_6(g[0xF]); + t7[0xF] = B64_7(g[0xF]); + g[0x0] = T0[t0[0x0]] ^ T1[t1[0x1]] ^ T2[t2[0x2]] ^ T3[t3[0x3]] ^ T4[t4[0x4]] ^ T5[t5[0x5]] ^ T6[t6[0x6]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x1] = T0[t0[0x1]] ^ T1[t1[0x2]] ^ T2[t2[0x3]] ^ T3[t3[0x4]] ^ T4[t4[0x5]] ^ T5[t5[0x6]] ^ T6[t6[0x7]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x2] = T0[t0[0x2]] ^ T1[t1[0x3]] ^ T2[t2[0x4]] ^ T3[t3[0x5]] ^ T4[t4[0x6]] ^ T5[t5[0x7]] ^ T6[t6[0x8]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x3] = T0[t0[0x3]] ^ T1[t1[0x4]] ^ T2[t2[0x5]] ^ T3[t3[0x6]] ^ T4[t4[0x7]] ^ T5[t5[0x8]] ^ T6[t6[0x9]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x4] = T0[t0[0x4]] ^ T1[t1[0x5]] ^ T2[t2[0x6]] ^ T3[t3[0x7]] ^ T4[t4[0x8]] ^ T5[t5[0x9]] ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0xF]]; + g[0x5] = T0[t0[0x5]] ^ T1[t1[0x6]] ^ T2[t2[0x7]] ^ T3[t3[0x8]] ^ T4[t4[0x9]] ^ 
C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x0]]; + g[0x6] = T0[t0[0x6]] ^ T1[t1[0x7]] ^ T2[t2[0x8]] ^ T3[t3[0x9]] ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x1]]; + g[0x7] = T0[t0[0x7]] ^ T1[t1[0x8]] ^ T2[t2[0x9]] ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x2]]; + g[0x8] = T0[t0[0x8]] ^ T1[t1[0x9]] ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x3]]; + g[0x9] = T0[t0[0x9]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ T6[t6[0xF]] ^ T7[t7[0x4]]; + g[0xA] = T0[t0[0xA]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ T5[t5[0xF]] ^ T6[t6[0x0]] ^ T7[t7[0x5]]; + g[0xB] = T0[t0[0xB]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ T4[t4[0xF]] ^ T5[t5[0x0]] ^ T6[t6[0x1]] ^ T7[t7[0x6]]; + g[0xC] = T0[t0[0xC]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ T3[t3[0xF]] ^ T4[t4[0x0]] ^ T5[t5[0x1]] ^ T6[t6[0x2]] ^ T7[t7[0x7]]; + g[0xD] = T0[t0[0xD]] ^ C64e(0xc6c632f4a5f497a5) ^ T2[t2[0xF]] ^ T3[t3[0x0]] ^ T4[t4[0x1]] ^ T5[t5[0x2]] ^ T6[t6[0x3]] ^ T7[t7[0x8]]; + g[0xE] = T0[t0[0xE]] ^ T1[t1[0xF]] ^ T2[t2[0x0]] ^ T3[t3[0x1]] ^ T4[t4[0x2]] ^ T5[t5[0x3]] ^ T6[t6[0x4]] ^ T7[t7[0x9]]; + g[0xF] = T0[t0[0xF]] ^ T1[t1[0x0]] ^ T2[t2[0x1]] ^ T3[t3[0x2]] ^ T4[t4[0x3]] ^ T5[t5[0x4]] ^ T6[t6[0x5]] ^ T7[t7[0xA]]; + + + PERM_BIG_P(g, 1, 14); + PERM_BIG_Q(m); + +#pragma unroll + for (u = 0; u < 16; u++) g[u] ^= m[u]; +#pragma unroll + for (u = 0; u < 8; u++) m[u] = g[u + 8]; + g[15] ^= H15; + PERM_BIG_P(g, 0, r); + +round: +// move the ^= to the relevant first byte down here? tried that, was slower?!?!? 
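+// A note on the flag/r control flow that begins here, as this tail is structured: the final test at end: reads only g[0xB] ^ m[3] (output word 3 + 8), so the closing round of each pass computes just the state words that still feed that lane. +// flag == 0: partial round r = 13 closing the first pass; only the upper half g[0x8..0xF] is produced, since it is folded below via m[u] ^= g[u + 8]. +// flag == 1: partial round r = 12 of the second pass's output transform; it produces the eight words (0x0, 0x1, 0x6, 0xB..0xF) consumed by RBTT(g[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6). +// flag == 2: round r = 13 computes g[0xB] alone, then jumps to end.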
+ g[0x0] ^= PC64(0x00, r); + g[0x1] ^= PC64(0x10, r); + g[0x6] ^= PC64(0x60, r); + g[0xB] ^= PC64(0xB0, r); + g[0xC] ^= PC64(0xC0, r); + g[0xD] ^= PC64(0xD0, r); + g[0xE] ^= PC64(0xE0, r); + g[0xF] ^= PC64(0xF0, r); + t0[0x0] = B64_0(g[0x0]); + t1[0x0] = B64_1(g[0x0]); + t2[0x0] = B64_2(g[0x0]); + t3[0x0] = B64_3(g[0x0]); + t4[0x0] = B64_4(g[0x0]); + t5[0x0] = B64_5(g[0x0]); + t6[0x0] = B64_6(g[0x0]); + t7[0x0] = B64_7(g[0x0]); + t0[0x1] = B64_0(g[0x1]); + t1[0x1] = B64_1(g[0x1]); + t2[0x1] = B64_2(g[0x1]); + t3[0x1] = B64_3(g[0x1]); + t4[0x1] = B64_4(g[0x1]); + t5[0x1] = B64_5(g[0x1]); + t6[0x1] = B64_6(g[0x1]); + t7[0x1] = B64_7(g[0x1]); + t0[0x6] = B64_0(g[0x6]); + t1[0x6] = B64_1(g[0x6]); + t2[0x6] = B64_2(g[0x6]); + t3[0x6] = B64_3(g[0x6]); + t4[0x6] = B64_4(g[0x6]); + t5[0x6] = B64_5(g[0x6]); + t6[0x6] = B64_6(g[0x6]); + t7[0x6] = B64_7(g[0x6]); + t0[0xB] = B64_0(g[0xB]); + t1[0xB] = B64_1(g[0xB]); + t2[0xB] = B64_2(g[0xB]); + t3[0xB] = B64_3(g[0xB]); + t4[0xB] = B64_4(g[0xB]); + t5[0xB] = B64_5(g[0xB]); + t6[0xB] = B64_6(g[0xB]); + t7[0xB] = B64_7(g[0xB]); + t0[0xC] = B64_0(g[0xC]); + t1[0xC] = B64_1(g[0xC]); + t2[0xC] = B64_2(g[0xC]); + t3[0xC] = B64_3(g[0xC]); + t4[0xC] = B64_4(g[0xC]); + t5[0xC] = B64_5(g[0xC]); + t6[0xC] = B64_6(g[0xC]); + t7[0xC] = B64_7(g[0xC]); + t0[0xD] = B64_0(g[0xD]); + t1[0xD] = B64_1(g[0xD]); + t2[0xD] = B64_2(g[0xD]); + t3[0xD] = B64_3(g[0xD]); + t4[0xD] = B64_4(g[0xD]); + t5[0xD] = B64_5(g[0xD]); + t6[0xD] = B64_6(g[0xD]); + t7[0xD] = B64_7(g[0xD]); + t0[0xE] = B64_0(g[0xE]); + t1[0xE] = B64_1(g[0xE]); + t2[0xE] = B64_2(g[0xE]); + t3[0xE] = B64_3(g[0xE]); + t4[0xE] = B64_4(g[0xE]); + t5[0xE] = B64_5(g[0xE]); + t6[0xE] = B64_6(g[0xE]); + t7[0xE] = B64_7(g[0xE]); + t0[0xF] = B64_0(g[0xF]); + t1[0xF] = B64_1(g[0xF]); + t2[0xF] = B64_2(g[0xF]); + t3[0xF] = B64_3(g[0xF]); + t4[0xF] = B64_4(g[0xF]); + t5[0xF] = B64_5(g[0xF]); + t6[0xF] = B64_6(g[0xF]); + t7[0xF] = B64_7(g[0xF]); + + if (flag < 2) { + g[0x2] ^= PC64(0x20, r); + g[0x3] ^= PC64(0x30, r); + g[0x4] ^= PC64(0x40, r); + g[0x5] ^= PC64(0x50, r); + g[0x7] ^= PC64(0x70, r); + g[0x8] ^= PC64(0x80, r); + g[0x9] ^= PC64(0x90, r); + g[0xA] ^= PC64(0xA0, r); + t0[0x2] = B64_0(g[0x2]); + t1[0x2] = B64_1(g[0x2]); + t2[0x2] = B64_2(g[0x2]); + t3[0x2] = B64_3(g[0x2]); + t4[0x2] = B64_4(g[0x2]); + t5[0x2] = B64_5(g[0x2]); + t6[0x2] = B64_6(g[0x2]); + t7[0x2] = B64_7(g[0x2]); + t0[0x3] = B64_0(g[0x3]); + t1[0x3] = B64_1(g[0x3]); + t2[0x3] = B64_2(g[0x3]); + t3[0x3] = B64_3(g[0x3]); + t4[0x3] = B64_4(g[0x3]); + t5[0x3] = B64_5(g[0x3]); + t6[0x3] = B64_6(g[0x3]); + t7[0x3] = B64_7(g[0x3]); + t0[0x4] = B64_0(g[0x4]); + t1[0x4] = B64_1(g[0x4]); + t2[0x4] = B64_2(g[0x4]); + t3[0x4] = B64_3(g[0x4]); + t4[0x4] = B64_4(g[0x4]); + t5[0x4] = B64_5(g[0x4]); + t6[0x4] = B64_6(g[0x4]); + t7[0x4] = B64_7(g[0x4]); + t0[0x5] = B64_0(g[0x5]); + t1[0x5] = B64_1(g[0x5]); + t2[0x5] = B64_2(g[0x5]); + t3[0x5] = B64_3(g[0x5]); + t4[0x5] = B64_4(g[0x5]); + t5[0x5] = B64_5(g[0x5]); + t6[0x5] = B64_6(g[0x5]); + t7[0x5] = B64_7(g[0x5]); + t0[0x7] = B64_0(g[0x7]); + t1[0x7] = B64_1(g[0x7]); + t2[0x7] = B64_2(g[0x7]); + t3[0x7] = B64_3(g[0x7]); + t4[0x7] = B64_4(g[0x7]); + t5[0x7] = B64_5(g[0x7]); + t6[0x7] = B64_6(g[0x7]); + t7[0x7] = B64_7(g[0x7]); + t0[0x8] = B64_0(g[0x8]); + t1[0x8] = B64_1(g[0x8]); + t2[0x8] = B64_2(g[0x8]); + t3[0x8] = B64_3(g[0x8]); + t4[0x8] = B64_4(g[0x8]); + t5[0x8] = B64_5(g[0x8]); + t6[0x8] = B64_6(g[0x8]); + t7[0x8] = B64_7(g[0x8]); + t0[0x9] = B64_0(g[0x9]); + t1[0x9] = B64_1(g[0x9]); + t2[0x9] = 
B64_2(g[0x9]); + t3[0x9] = B64_3(g[0x9]); + t4[0x9] = B64_4(g[0x9]); + t5[0x9] = B64_5(g[0x9]); + t6[0x9] = B64_6(g[0x9]); + t7[0x9] = B64_7(g[0x9]); + t0[0xA] = B64_0(g[0xA]); + t1[0xA] = B64_1(g[0xA]); + t2[0xA] = B64_2(g[0xA]); + t3[0xA] = B64_3(g[0xA]); + t4[0xA] = B64_4(g[0xA]); + t5[0xA] = B64_5(g[0xA]); + t6[0xA] = B64_6(g[0xA]); + t7[0xA] = B64_7(g[0xA]); + if (flag == 0) { + RBTT(g[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); + RBTT(g[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); + RBTT(g[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); + } else { + RBTT(g[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); + RBTT(g[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); + RBTT(g[0x6], 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); + } + RBTT(g[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); + RBTT(g[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); + RBTT(g[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); + RBTT(g[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); + } + RBTT(g[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); + + if (flag == 2) goto end; + if (flag++ == 1) { + r = 13; + goto round; + } + + r = 12; +#pragma unroll + for (u = 0; u < 8; u++) m[u] ^= g[u + 8]; + m[7] ^= H15; + m[8] = 0x80; + m[9] = 0; + m[10] = 0; + goto perm; + +end: + if ((g[3 + 8] ^ m[3]) <= target) output[output[0xFF]++] = as_uint(as_uchar4(gid).wzyx); +} + +#endif + diff --git a/kernel/keccak1600.cl b/kernel/keccak1600.cl new file mode 100644 index 000000000..d870a155e --- /dev/null +++ b/kernel/keccak1600.cl @@ -0,0 +1,84 @@ +/* + * keccak_1600 function + * C. Buchner 2014 + * + */ + +__constant static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + + +inline void keccak_block(ulong *s) { + size_t i; + ulong t[5], u[5], v, w; + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ SPH_ROTL64(t[1], 1); + u[1] = t[0] ^ SPH_ROTL64(t[2], 1); + u[2] = t[1] ^ SPH_ROTL64(t[3], 1); + u[3] = t[2] ^ SPH_ROTL64(t[4], 1); + u[4] = t[3] ^ SPH_ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[1]; + s[1] = SPH_ROTL64(s[6], 44); + s[6] = SPH_ROTL64(s[9], 20); + s[9] = SPH_ROTL64(s[22], 61); + s[22] = SPH_ROTL64(s[14], 39); + s[14] = SPH_ROTL64(s[20], 18); + s[20] = SPH_ROTL64(s[2], 62); + s[2] = SPH_ROTL64(s[12], 43); + s[12] = SPH_ROTL64(s[13], 25); + s[13] = SPH_ROTL64(s[19], 8); + s[19] = SPH_ROTL64(s[23], 56); + s[23] = SPH_ROTL64(s[15], 41); + s[15] = SPH_ROTL64(s[4], 27); + s[4] = SPH_ROTL64(s[24], 14); + s[24] = SPH_ROTL64(s[21], 2); + s[21] = SPH_ROTL64(s[8], 55); + s[8] = SPH_ROTL64(s[16], 45); + s[16] = SPH_ROTL64(s[5], 36); + s[5] = SPH_ROTL64(s[3], 28); + s[3] = SPH_ROTL64(s[18], 21); + s[18] = SPH_ROTL64(s[17], 15); + s[17] = SPH_ROTL64(s[11], 10); + s[11] = SPH_ROTL64(s[7], 6); + s[7] = SPH_ROTL64(s[10], 3); + s[10] = SPH_ROTL64(v, 1); + + /* chi: a[y,x] ^= ~a[y,x+1] & a[y,x+2], row by row */ + v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; + v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: xor the round constant into a[0,0] */ + s[0] ^= RC[i]; + } +} \ No newline at end of file diff --git a/kernel/pluck.cl b/kernel/pluck.cl new file mode 100644 index 000000000..4fa501c76 --- /dev/null +++ b/kernel/pluck.cl @@ -0,0 +1,463 @@ +/* +* "pluck" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif +#define ROL32(x, n) rotate(x, (uint) n) +//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define HASH_MEMORY 4096 + + +#define SALSA(a,b,c,d) do { \ + t =a+d; b^=rotate(t, 7U); \ + t =b+a; c^=rotate(t, 9U); \ + t =c+b; d^=rotate(t, 13U); \ + t =d+c; a^=rotate(t, 18U); \ +} while(0) + + +#define SALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } while(0) + +/* +#define SALSA_CORE(state) do { \ + state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ + state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ + state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ + state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \ + state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \ + state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \ + state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \ + state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ +} while(0) +*/ +uint16 xor_salsa8(uint16 Bx) +{ +uint t; + uint16 st = Bx; + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + return(st + Bx); +} + + + +#define SHR(x, n) ((x) >> n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define PLAST(a,b,c,d,e,f,g,h,x,K) \ +{ \ + d += h + S3(e) + F1(e,f,g) + (x + K); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 
(W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +inline uint8 sha256_round1(uint16 data) +{ + uint temp1; + uint8 res; + uint W0 = SWAP32(data.s0); + uint W1 = SWAP32(data.s1); + uint W2 = SWAP32(data.s2); + uint W3 = SWAP32(data.s3); + uint W4 = SWAP32(data.s4); + uint W5 = SWAP32(data.s5); + uint W6 = SWAP32(data.s6); + uint W7 = SWAP32(data.s7); + uint W8 = SWAP32(data.s8); + uint W9 = SWAP32(data.s9); + uint W10 = SWAP32(data.sA); + uint W11 = SWAP32(data.sB); + uint W12 = SWAP32(data.sC); + uint W13 = SWAP32(data.sD); + uint W14 = SWAP32(data.sE); + uint W15 = SWAP32(data.sF); + + uint v0 = 0x6A09E667; + uint v1 = 0xBB67AE85; + uint v2 = 0x3C6EF372; + uint v3 = 0xA54FF53A; + uint v4 = 0x510E527F; + uint v5 = 0x9B05688C; + uint v6 = 0x1F83D9AB; + uint v7 = 0x5BE0CD19; + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, 
v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = v0 + 0x6A09E667; + res.s1 = v1 + 0xBB67AE85; + res.s2 = v2 + 0x3C6EF372; + res.s3 = v3 + 0xA54FF53A; + res.s4 = v4 + 0x510E527F; + res.s5 = v5 + 0x9B05688C; + res.s6 = v6 + 0x1F83D9AB; + res.s7 = v7 + 0x5BE0CD19; + return (res); +} + + +inline uint8 sha256_round2(uint16 data,uint8 buf) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = buf.s0; + uint v1 = buf.s1; + uint v2 = buf.s2; + uint v3 = buf.s3; + uint v4 = buf.s4; + uint v5 = buf.s5; + uint v6 = buf.s6; + uint v7 = buf.s7; + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + 
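// rounds 29-31 below close out the K[16..31] group of the 64 compression rounds; + // the R0..R15 macros expand the SHA-256 message schedule in place, + // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], + // with sigma0/sigma1 written as left-rotate equivalents of the usual + // right-rotates (e.g. ROTR(x,7) == ROL32(x,25)) +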
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = SWAP32(v0 + buf.s0); + res.s1 = SWAP32(v1 + buf.s1); + res.s2 = SWAP32(v2 + buf.s2); + res.s3 = SWAP32(v3 + buf.s3); + res.s4 = SWAP32(v4 + buf.s4); + res.s5 = SWAP32(v5 + buf.s5); + res.s6 = SWAP32(v6 + buf.s6); + res.s7 = SWAP32(v7 + buf.s7); + return (res); +} + +inline uint8 sha256_80(uint* data,uint nonce) +{ + +uint8 buf = sha256_round1( ((uint16*)data)[0]); +uint in[16]; +for (int i = 0; i<3; i++) { in[i] = SWAP32(data[i + 16]); } +in[3] = SWAP32(nonce); +in[4] = 0x80000000; +in[15] = 0x280; +for (int i = 5; i<15; i++) { in[i] = 0; } + +return(sha256_round2(((uint16*)in)[0], buf)); +} + +inline uint8 sha256_64(uint* data) +{ + +uint8 buf=sha256_round1(((uint16*)data)[0]); +uint in[16]; +for (int i = 1; i<15; i++) { in[i] = 0; } +in[0] = 0x80000000; +in[15] = 0x200; + + return(sha256_round2(((uint16*)in)[0],buf)); +} + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) +{ + + __global uchar *hashbuffer = (__global uchar *)(padcache + (1024*128 * (get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint data[20]; + + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; + + ((__global uint8*)hashbuffer)[0] = sha256_80(data,get_global_id(0)); + ((__global uint8*)hashbuffer)[1] = 0; + + for (int i = 2; i < 4096 - 1; i++) + { + uint randmax = i * 32 - 4; + uint randseed[16]; + uint randbuffer[16]; + uint joint[16]; + + ((uint8*)randseed)[0] = ((__global uint8*)hashbuffer)[i - 2]; + ((uint8*)randseed)[1] = ((__global uint8*)hashbuffer)[i - 
1]; + + if (i>4) + { + + ((uint8*)randseed)[0] ^= ((__global uint8*)hashbuffer)[i - 4]; + ((uint8*)randseed)[1] ^= ((__global uint8*)hashbuffer)[i - 3]; + } + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + + + ((uint8*)joint)[0] = ((__global uint8*)hashbuffer)[i - 1]; + for (int j = 0; j < 8; j++) + { + uint rand = randbuffer[j] % (randmax - 32); + + ((uchar4*)joint)[(j + 8)].x =((__global uchar*)(hashbuffer))[0+rand]; + ((uchar4*)joint)[(j + 8)].y =((__global uchar*)(hashbuffer))[1+rand]; + ((uchar4*)joint)[(j + 8)].z =((__global uchar*)(hashbuffer))[2+rand]; + ((uchar4*)joint)[(j + 8)].w =((__global uchar*)(hashbuffer))[3+rand]; +} + ((__global uint8*)(hashbuffer))[i] = sha256_64(joint); + + + + (( uint8*)randseed)[0] = ((__global uint8*)(hashbuffer))[i - 1]; + (( uint8*)randseed)[1] = ((__global uint8*)(hashbuffer))[i]; + + + if (i>4) + { + + ((uint8*)randseed)[0] ^= ((__global uint8*)(hashbuffer))[i - 4]; + ((uint8*)randseed)[1] ^= ((__global uint8*)(hashbuffer))[i - 3]; + + } + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + for (int j = 0; j < 32; j += 2) + { + uint rand = randbuffer[j / 2] % randmax; + uchar4 Tohere; + + Tohere.x = ((__global uchar*)(hashbuffer))[randmax + j]; + Tohere.y = ((__global uchar*)(hashbuffer))[randmax + j + 1]; + Tohere.z = ((__global uchar*)(hashbuffer))[randmax + j + 2]; + Tohere.w = ((__global uchar*)(hashbuffer))[randmax + j + 3]; + ((__global uchar*)(hashbuffer))[rand] = Tohere.x; + ((__global uchar*)(hashbuffer))[rand+1] = Tohere.y; + ((__global uchar*)(hashbuffer))[rand+2] = Tohere.z; + ((__global uchar*)(hashbuffer))[rand+3] = Tohere.w; + + } + + } // main loop + + + if( ((__global uint *)hashbuffer)[7] <= (target)) {output[atomic_inc(output + 0xFF)] = SWAP32(get_global_id(0)); +//printf("gpu hashbuffer %08x nonce %08x\n",((__global uint *)hashbuffer)[7] ,SWAP32(get_global_id(0))); +} + + + +///////////////////////////////////////////////////////////////// + +} \ No newline at end of file diff --git a/kernel/skein256.cl b/kernel/skein256.cl new file mode 100644 index 000000000..a7d85faf4 --- /dev/null +++ b/kernel/skein256.cl @@ -0,0 +1,105 @@ + +/* +* skein256 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + +__constant static const sph_u64 SKEIN_IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; + +__constant static const sph_u64 SKEIN_IV512_256[8] = { + 0xCCD044A12FDB3E13UL, 0xE83590301A79A9EBUL, + 0x55AEA0614F816E6FUL, 0x2A2767A4AE9B94DBUL, + 0xEC06025E74DD7683UL, 0xE7A436CDC4746251UL, + 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL +}; + + + +__constant static const int ROT256[8][4] = +{ + {46, 36, 19, 37}, + {33, 27, 14, 42}, + {17, 49, 36, 39}, + {44, 9, 54, 56}, + {39, 30, 34, 24}, + {13, 50, 10, 17}, + {25, 29, 39, 43}, + {8, 35, 56, 22} +}; + +__constant static const sph_u64 skein_ks_parity = 0x1BD11BDAA9FC1A22; + +__constant static const sph_u64 t12[6] = +{ 0x20UL, +0xf000000000000000UL, +0xf000000000000020UL, +0x08UL, +0xff00000000000000UL, +0xff00000000000008UL +}; + + +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT) { \ +p0 += p1; p1 = SPH_ROTL64(p1, ROT256[ROT][0]); p1 ^= p0; \ +p2 += p3; p3 = SPH_ROTL64(p3, ROT256[ROT][1]); p3 ^= p2; \ +p4 += p5; p5 = SPH_ROTL64(p5, ROT256[ROT][2]); p5 ^= p4; \ +p6 += p7; p7 = SPH_ROTL64(p7, ROT256[ROT][3]); p7 ^= p6; \ +} + +#define Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, R) { \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, 0); \ + Round512(p2, p1, p4, p7, p6, p5, p0, p3, 1); \ + Round512(p4, p1, p6, p3, p0, p5, p2, p7, 2); \ + Round512(p6, p1, p0, p7, p2, p5, p4, p3, 3); \ + p0 += h[((R)+0) % 9]; \ + p1 += h[((R)+1) % 9]; \ + p2 += h[((R)+2) % 9]; \ + p3 += h[((R)+3) % 9]; \ + p4 += h[((R)+4) % 9]; \ + p5 += h[((R)+5) % 9] + t[((R)+0) % 3]; \ + p6 += h[((R)+6) % 9] + t[((R)+1) % 3]; \ + p7 += h[((R)+7) % 9] + R; \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, 4); \ + Round512(p2, p1, p4, p7, p6, p5, p0, p3, 5); \ + Round512(p4, p1, p6, p3, p0, p5, p2, p7, 6); \ + Round512(p6, p1, p0, p7, p2, p5, p4, p3, 7); \ + p0 += h[((R)+1) % 9]; \ + p1 += h[((R)+2) % 9]; \ + p2 += h[((R)+3) % 9]; \ + p3 += h[((R)+4) % 9]; \ + p4 += h[((R)+5) % 9]; \ + p5 += h[((R)+6) % 9] + t[((R)+1) % 3]; \ + p6 += h[((R)+7) % 9] + t[((R)+2) % 3]; \ + p7 += h[((R)+8) % 9] + (R+1); \ +} \ No newline at end of file diff --git a/kernel/yescrypt-multi.cl b/kernel/yescrypt-multi.cl new file mode 100644 index 000000000..3af7b28ac --- /dev/null +++ b/kernel/yescrypt-multi.cl @@ -0,0 +1,314 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, __global uchar* buff3, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; +// uint8 sha256tokeep; + +// ulong16 Bdev[8]; // will require an additional buffer + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep[0] = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); +// Bdev[i].lo = as_ulong8(result); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); +// Bdev[i].hi = as_ulong8(result); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= 
prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uchar *buffer1, __global uchar *buffer2) +{ +} + + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + int n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i ++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); +//// + } + +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global const uchar* restrict input, __global uint* restrict output, __global uchar *buff2,__global uchar* buff3, const uint target) +{ + + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint nonce = (get_global_id(0)); + + + uint data[20]; + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + uint8 swpass = swapvec(sha256tokeep[0]); + uint16 in; + uint8 state1,state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; 
i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} diff --git a/kernel/yescrypt.cl b/kernel/yescrypt.cl new file mode 100644 index 000000000..0a94ebcab --- /dev/null +++ b/kernel/yescrypt.cl @@ -0,0 +1,253 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; + uint8 sha256tokeep; + + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; + for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; 
k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + //// + } + + + + uint8 swpass = swapvec(sha256tokeep); +// uint16 in; +// uint8 state1, state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} + diff --git a/kernel/yescrypt_essential.cl b/kernel/yescrypt_essential.cl new file mode 100644 index 000000000..ba1816a8e --- /dev/null +++ b/kernel/yescrypt_essential.cl @@ -0,0 +1,760 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + +#define ROL32(x, n) rotate(x, (uint) n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) +//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define HASH_MEMORY 4096 + + +#define SALSA(a,b,c,d) do { \ + t =a+d; b^=ROL32(t, 7U); \ + t =b+a; c^=ROL32(t, 9U); \ + t =c+b; d^=ROL32(t, 13U); \ + t =d+c; a^=ROL32(t, 18U); \ +} while(0) + + +#define SALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } while(0) + +#define uSALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s1,state.s5,state.s9,state.sd); \ +SALSA(state.s2,state.s6,state.sa,state.se); \ +SALSA(state.s3,state.s7,state.sb,state.sf); \ +SALSA(state.s0,state.sd,state.sa,state.s7); \ +SALSA(state.s1,state.se,state.sb,state.s4); \ +SALSA(state.s2,state.sf,state.s8,state.s5); \ +SALSA(state.s3,state.sc,state.s9,state.s6); \ +} while(0) + + +#define unshuffle(state) (as_uint16(state).s0da741eb852fc963) + +#define shuffle(state) (as_uint16(state).s05af49e38d27c16b) + +static __constant uint16 pad1 = +{ + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636 +}; + +static __constant uint16 pad2 = +{ + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c +}; + +static __constant uint16 pad5 = +{ + 0x00000001, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00002220 +}; + +static __constant uint16 pad3 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x000004a0 +}; + +static __constant uint16 padsha80 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000280 +}; + +static __constant uint8 pad4 = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000300 +}; + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +inline uint8 swapvec(uint8 buf) +{ + uint8 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + return vec; +} + + + +inline uint16 swapvec16(uint16 buf) +{ + uint16 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + vec.s8 = SWAP32(buf.s8); + vec.s9 = SWAP32(buf.s9); + vec.sa = 
SWAP32(buf.sa); + vec.sb = SWAP32(buf.sb); + vec.sc = SWAP32(buf.sc); + vec.sd = SWAP32(buf.sd); + vec.se = SWAP32(buf.se); + vec.sf = SWAP32(buf.sf); + return vec; +} + + ulong8 salsa20_8(uint16 Bx) +{ +uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(as_ulong8(st + Bx)); +} + + ulong8 salsa20_8n(uint16 Bx) + { + uint t; + uint16 st = Bx; + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + return(as_ulong8(st + Bx)); + } + + + ulong16 blockmix_salsa8_small2(ulong16 Bin) +{ + ulong8 X = Bin.hi; + X ^= Bin.lo; + X = salsa20_8(as_uint16(X)); + Bin.lo = X; + X ^= Bin.hi; + X = salsa20_8(as_uint16(X)); + Bin.hi = X; + return(Bin); +} +/* + uint16 salsa20_8_2(uint16 Bx) + { + uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(st + Bx); + } + + ulong16 blockmix_salsa8_small2(ulong16 Bin) + { + uint16 X = as_uint16(Bin.hi); + X ^= as_uint16(Bin.lo); + X = salsa20_8_2(as_uint16(X)); + Bin.lo = as_ulong8(X); + X ^= as_uint16(Bin.hi); + X = salsa20_8_2(as_uint16(X)); + Bin.hi = as_ulong8(X); + return(Bin); + } +*/ + + +inline ulong2 madd4long2(uint4 a, uint4 b) +{ + uint4 result; + result.x = a.x*a.y + b.x; + result.y = b.y + mad_hi(a.x, a.y, b.x); + result.z = a.z*a.w + b.z; + result.w = b.w + mad_hi(a.z, a.w, b.z); + return as_ulong2(result); +} + +inline ulong2 madd4long3(uint4 a, ulong2 b) +{ + ulong2 result; + result.x = (ulong)a.x*(ulong)a.y + b.x; + result.y = (ulong)a.z*(ulong)a.w + b.y; + return result; +} + + +inline ulong8 block_pwxform_long_old(ulong8 Bout, __global ulong16 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate ))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + +inline ulong8 block_pwxform_long(ulong8 Bout, __global ulong2 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32*8)[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 
0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + + + + +inline void blockmix_pwxform(__global ulong8 *Bin, __global ulong16 *prevstate) +{ + Bin[0] ^= Bin[15]; + Bin[0] = block_pwxform_long_old(Bin[0], prevstate); +#pragma unroll 1 + for (int i = 1; i < 16; i++) + { + Bin[i] ^= Bin[i - 1]; + Bin[i] = block_pwxform_long_old(Bin[i], prevstate); + } + Bin[15] = salsa20_8(as_uint16(Bin[15])); +} + +#define SHR(x, n) ((x) >> n) + + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define PLAST(a,b,c,d,e,f,g,h,x,K) \ +{ \ + d += h + S3(e) + F1(e,f,g) + (x + K); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ +uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + 
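// rounds 12-15 below complete the first group of 16 rounds, which consume + // the sixteen input words W0..W15 directly; the remaining 48 rounds reuse + // the R0..R15 macros to update the message schedule in place +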
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res+state); +} + + +static inline uint8 sha256_round1(uint16 data) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = 0x6A09E667; + uint v1 = 0xBB67AE85; + uint v2 = 0x3C6EF372; + uint v3 = 0xA54FF53A; + uint v4 = 0x510E527F; + uint v5 = 0x9B05688C; + uint v6 = 0x1F83D9AB; + uint v7 = 0x5BE0CD19; + + P(v0, v1, v2, v3, v4, v5, v6, v7, 
W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = v0 + 0x6A09E667; + res.s1 = v1 + 0xBB67AE85; + res.s2 = v2 + 0x3C6EF372; + res.s3 = v3 + 0xA54FF53A; + res.s4 = v4 + 0x510E527F; + res.s5 = v5 + 
0x9B05688C; + res.s6 = v6 + 0x1F83D9AB; + res.s7 = v7 + 0x5BE0CD19; + return (res); +} + + +static inline uint8 sha256_round2(uint16 data,uint8 buf) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = buf.s0; + uint v1 = buf.s1; + uint v2 = buf.s2; + uint v3 = buf.s3; + uint v4 = buf.s4; + uint v5 = buf.s5; + uint v6 = buf.s6; + uint v7 = buf.s7; + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, 
R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = (v0 + buf.s0); + res.s1 = (v1 + buf.s1); + res.s2 = (v2 + buf.s2); + res.s3 = (v3 + buf.s3); + res.s4 = (v4 + buf.s4); + res.s5 = (v5 + buf.s5); + res.s6 = (v6 + buf.s6); + res.s7 = (v7 + buf.s7); + return (res); +} + +static inline uint8 sha256_80(uint* data,uint nonce) +{ + +uint8 buf = sha256_round1( ((uint16*)data)[0]); +uint16 in = padsha80; +in.s0 = data[16]; +in.s1 = data[17]; +in.s2 = data[18]; +in.s3 = nonce; + +return(sha256_round2(in,buf)); +} + diff --git a/miner.h b/miner.h index 206d2763f..09e8d4222 100644 --- a/miner.h +++ b/miner.h @@ -692,6 +692,7 @@ static inline void flip32(void *dest_p, const void *src_p) dest[i] = swab32(src[i]); } + static inline void flip64(void *dest_p, const void *src_p) { uint32_t *dest = (uint32_t *)dest_p; @@ -722,6 +723,17 @@ static inline void flip128(void *dest_p, const void *src_p) dest[i] = swab32(src[i]); } +static inline void flip168(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *)dest_p; + const uint32_t *src = (uint32_t *)src_p; + int i; + + for (i = 0; i < 42; i++) + dest[i] = swab32(src[i]); +} + + /* For flipping to the correct endianness if necessary */ #if defined(__BIG_ENDIAN__) || defined(MIPSEB) static inline void endian_flip32(void *dest_p, const void *src_p) @@ -733,6 +745,11 @@ static inline void endian_flip128(void *dest_p, const void *src_p) { flip128(dest_p, src_p); } +static inline void endian_flip168(void *dest_p, const void *src_p) +{ + flip168(dest_p, src_p); +} + #else static inline void endian_flip32(void __maybe_unused *dest_p, const void __maybe_unused *src_p) @@ -743,8 +760,13 @@ static inline void endian_flip128(void __maybe_unused *dest_p, const void __maybe_unused *src_p) { } +static inline void +endian_flip168(void __maybe_unused *dest_p, const void __maybe_unused *src_p) +{ +} #endif + extern double cgpu_runtime(struct cgpu_info *cgpu); extern void _quit(int status); @@ -1013,6 +1035,7 @@ extern bool opt_protocol; extern bool have_longpoll; extern char *opt_kernel_path; extern char *opt_socks_proxy; +extern bool opt_lyra; #if defined(unix) || defined(__APPLE__) extern char *opt_stderr_cmd; @@ -1133,10 +1156,10 @@ extern struct pool *add_pool(void); extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user, char *pass, char *name, char *desc, char *profile, char *algo); #define MAX_GPUDEVICES 16 -#define MAX_DEVICES 4096 - -#define MIN_INTENSITY 8 -#define MIN_INTENSITY_STR "8" +//#define MAX_DEVICES 4096 +#define MAX_DEVICES 8192 +#define MIN_INTENSITY 4 +#define MIN_INTENSITY_STR "4" #define MAX_INTENSITY 31 #define MAX_INTENSITY_STR "31" #define MIN_XINTENSITY 1 @@ -1244,6 +1267,7 @@ struct stratum_work { size_t cb_len; size_t header_len; int merkles; + double next_diff; double diff; }; @@ -1404,7 +1428,7 @@ struct pool { #define GETWORK_MODE_GBT 'G' struct work { - unsigned char data[128]; + unsigned char data[168]; unsigned char midstate[32]; unsigned char target[32]; unsigned char hash[32]; diff --git 
diff --git a/ocl.c b/ocl.c
index 1d624e317..138561239 100644
--- a/ocl.c
+++ b/ocl.c
@@ -35,6 +35,9 @@
 #include "ocl/build_kernel.h"
 #include "ocl/binary_kernel.h"
 #include "algorithm/neoscrypt.h"
+#include "algorithm/pluck.h"
+#include "algorithm/yescrypt.h"
+#include "algorithm/Lyra2RE.h"
 
 /* FIXME: only here for global config vars, replace with configuration.h
  * or similar as soon as config is in a struct instead of littered all
@@ -429,7 +432,258 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
 applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
- } else if (!cgpu->opt_tc) {
+ }
+ /////////////////////////////////// pluck TC
+ else if (!safe_cmp(cgpu->algorithm.name, "pluck") && !cgpu->opt_tc) {
+  size_t glob_thread_count;
+  long max_int;
+  unsigned char type = 0;
+
+  // determine which intensity type to use
+  if (cgpu->rawintensity > 0) {
+   glob_thread_count = cgpu->rawintensity;
+   max_int = glob_thread_count;
+   type = 2;
+  }
+  else if (cgpu->xintensity > 0) {
+   glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
+   max_int = cgpu->xintensity;
+   type = 1;
+  }
+  else {
+   glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
+   max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+  }
+
+  glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
+
+  // if TC * scratchbuf size is too big for memory... reduce to max
+  if ((glob_thread_count * PLUCK_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
+   /* Selected intensity will not run on this GPU. Not enough memory.
+    * Adapt the memory setting. */
+   // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
+   switch (type) {
+   // raw intensity
+   case 2:
+    while ((glob_thread_count * PLUCK_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+     --glob_thread_count;
+    }
+
+    max_int = glob_thread_count;
+    cgpu->rawintensity = glob_thread_count;
+    break;
+
+   // x intensity
+   case 1:
+    glob_thread_count = cgpu->max_alloc / PLUCK_SCRATCHBUF_SIZE;
+    max_int = glob_thread_count / clState->compute_shaders;
+
+    while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_XINTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
+     max_int = MIN_XINTENSITY;
+    }
+
+    cgpu->xintensity = max_int;
+    glob_thread_count = clState->compute_shaders * (1UL << max_int);
+    break;
+
+   default:
+    glob_thread_count = cgpu->max_alloc / PLUCK_SCRATCHBUF_SIZE;
+    while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_INTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+     max_int = MIN_INTENSITY;
+    }
+
+    cgpu->intensity = max_int;
+    glob_thread_count = 1UL << max_int;
+    break;
+   }
+  }
+
+  // TC is glob thread count
+  cgpu->thread_concurrency = glob_thread_count;
+
+  applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
+ }
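The three clamping strategies above share one invariant: glob_thread_count * PLUCK_SCRATCHBUF_SIZE must fit inside cgpu->max_alloc, the largest single OpenCL allocation the device reports. A worked example of the rawintensity case (the scratchpad and allocation sizes here are illustrative, not taken from pluck.h):

#include <stdint.h>

int main(void)
{
    uint64_t max_alloc = 1ULL << 30;   /* assume a 1 GiB allocation limit */
    uint64_t scratch   = 128 * 1024;   /* assume a 128 KiB per-thread scratchpad */
    uint64_t threads   = 1ULL << 17;   /* requested 131072 threads */

    /* same decrement loop as "case 2" above */
    while (threads && threads * scratch > max_alloc)
        --threads;

    /* 2^30 / 2^17 = 2^13, so threads is now 8192 */
    return (int)(threads != 8192);
}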
+ else if ((!safe_cmp(cgpu->algorithm.name, "yescrypt") ||
+           !safe_cmp(cgpu->algorithm.name, "yescrypt-multi")) && !cgpu->opt_tc) {
+  size_t glob_thread_count;
+  long max_int;
+  unsigned char type = 0;
+
+  // determine which intensity type to use
+  if (cgpu->rawintensity > 0) {
+   glob_thread_count = cgpu->rawintensity;
+   max_int = glob_thread_count;
+   type = 2;
+  }
+  else if (cgpu->xintensity > 0) {
+   glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
+   max_int = cgpu->xintensity;
+   type = 1;
+  }
+  else {
+   glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
+   max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+  }
+
+  glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
+
+  // if TC * scratchbuf size is too big for memory... reduce to max
+  if ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
+   /* Selected intensity will not run on this GPU. Not enough memory.
+    * Adapt the memory setting. */
+   // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
+   switch (type) {
+   // raw intensity
+   case 2:
+    while ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+     --glob_thread_count;
+    }
+
+    max_int = glob_thread_count;
+    cgpu->rawintensity = glob_thread_count;
+    break;
+
+   // x intensity
+   case 1:
+    glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE;
+    max_int = glob_thread_count / clState->compute_shaders;
+
+    while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_XINTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
+     max_int = MIN_XINTENSITY;
+    }
+
+    cgpu->xintensity = max_int;
+    glob_thread_count = clState->compute_shaders * (1UL << max_int);
+    break;
+
+   default:
+    glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE;
+    while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_INTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+     max_int = MIN_INTENSITY;
+    }
+
+    cgpu->intensity = max_int;
+    glob_thread_count = 1UL << max_int;
+    break;
+   }
+  }
+
+  // TC is glob thread count
+  cgpu->thread_concurrency = glob_thread_count;
+
+  applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
+ }
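The pluck and yescrypt branches above, and the lyra2REv2 branch that follows, are copies of the same sizing logic with only the scratchpad constant changed. A hypothetical factored form (nothing named tc_for exists in this patch, and the sketch drops the intensity_shift/xintensity_shift refinements for brevity) shows how little actually varies:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical condensed version of the per-algorithm sizing branches. */
struct tc_request {
    size_t rawintensity, xintensity, intensity; /* user settings; 0 = unset */
    size_t compute_shaders, work_size;
    uint64_t max_alloc;                         /* device allocation limit */
};

static size_t tc_for(const struct tc_request *r, size_t scratch)
{
    size_t tc;

    if (r->rawintensity > 0)
        tc = r->rawintensity;
    else if (r->xintensity > 0)
        tc = r->compute_shaders * r->xintensity;
    else
        tc = (size_t)1 << r->intensity;

    if (tc < r->work_size)
        tc = r->work_size;

    /* shrink until the scratchpads fit, as in the switch above */
    while (tc && (uint64_t)tc * scratch > r->max_alloc)
        --tc;
    return tc;
}

Each branch would then reduce to a single call such as tc_for(&req, PLUCK_SCRATCHBUF_SIZE).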
+ else if (!safe_cmp(cgpu->algorithm.name, "lyra2REv2") && !cgpu->opt_tc) {
+  size_t glob_thread_count;
+  long max_int;
+  unsigned char type = 0;
+
+  // determine which intensity type to use
+  if (cgpu->rawintensity > 0) {
+   glob_thread_count = cgpu->rawintensity;
+   max_int = glob_thread_count;
+   type = 2;
+  }
+  else if (cgpu->xintensity > 0) {
+   glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
+   max_int = cgpu->xintensity;
+   type = 1;
+  }
+  else {
+   glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
+   max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+  }
+
+  glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
+
+  // if TC * scratchbuf size is too big for memory... reduce to max
+  if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
+   /* Selected intensity will not run on this GPU. Not enough memory.
+    * Adapt the memory setting. */
+   // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
+   switch (type) {
+   // raw intensity
+   case 2:
+    while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+     --glob_thread_count;
+    }
+
+    max_int = glob_thread_count;
+    cgpu->rawintensity = glob_thread_count;
+    break;
+
+   // x intensity
+   case 1:
+    glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
+    max_int = glob_thread_count / clState->compute_shaders;
+
+    while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_XINTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
+     max_int = MIN_XINTENSITY;
+    }
+
+    cgpu->xintensity = max_int;
+    glob_thread_count = clState->compute_shaders * (1UL << max_int);
+    break;
+
+   default:
+    glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
+    while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
+     --max_int;
+    }
+
+    /* Check if max_intensity is >0. */
+    if (max_int < MIN_INTENSITY) {
+     applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+     max_int = MIN_INTENSITY;
+    }
+
+    cgpu->intensity = max_int;
+    glob_thread_count = 1UL << max_int;
+    break;
+   }
+  }
+
+  // TC is glob thread count
+  cgpu->thread_concurrency = glob_thread_count;
+
+  applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
+ } else if (!cgpu->opt_tc) {
 unsigned int sixtyfours;
 
 sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
@@ -518,21 +772,32 @@
 if (clState->n_extra_kernels > 0) {
 unsigned int i;
 char kernel_name[9];   // max: search99 + 0x0
+ char kernel_name2[10]; // max: search999 + 0x0
 
 clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels);
 
 for (i = 0; i < clState->n_extra_kernels; i++) {
- snprintf(kernel_name, 9, "%s%d", "search", i + 1);
- clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status);
+ if (i + 1 < 100) {
+  snprintf(kernel_name, 9, "%s%d", "search", i + 1);
+  clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status);
+ } else {
+  snprintf(kernel_name2, 10, "%s%d", "search", i + 1);
+  clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name2, &status);
+ }
 if (status != CL_SUCCESS) {
- applog(LOG_ERR, "Error %d: Creating ExtraKernel #%d from program. (clCreateKernel)", status, i);
+ applog(LOG_DEBUG, "Error %d: Creating ExtraKernel #%d from program. (clCreateKernel)", status, i);
 return NULL;
 }
 }
 }
 
 size_t bufsize;
- size_t readbufsize = 128;
+ size_t buf1size;
+ size_t buf2size;
+ size_t buf3size;
+
+ size_t readbufsize = (!safe_cmp(algorithm->name, "credits")) ? 168 : 128;
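readbufsize now depends on the algorithm three ways: 168 bytes for credits' oversized header, 80 bytes for the header-hashing algorithms (set inside the branches below, as neoscrypt already does), and the historical 128 bytes otherwise. Collected into one sketch for reference (read_buffer_size is hypothetical; the patch keeps the logic inline):

#include <string.h>
#include <stddef.h>

static size_t read_buffer_size(const char *algo)
{
    if (!strcmp(algo, "credits"))
        return 168; /* full 168-byte credits header */
    if (!strcmp(algo, "neoscrypt") || !strcmp(algo, "pluck") ||
        !strcmp(algo, "yescrypt") || !strcmp(algo, "yescrypt-multi") ||
        !strcmp(algo, "lyra2REv2"))
        return 80;  /* raw 80-byte block header */
    return 128;     /* legacy default */
}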
 
 if (algorithm->rw_buffer_size < 0) {
 // calc buffer size for neoscrypt
@@ -546,7 +811,50 @@
 applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
 // scrypt/n-scrypt
- } else {
+ }
+ else if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) {
+  /* The scratch/pad buffer needs YESCRYPT_SCRATCHBUF_SIZE bytes per thread. */
+  bufsize = YESCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+  buf1size = PLUCK_SECBUF_SIZE * cgpu->thread_concurrency;
+  buf2size = 128 * 8 * 8 * cgpu->thread_concurrency;
+  buf3size = 8 * 8 * 4 * cgpu->thread_concurrency;
+
+  /* This is the input buffer. For yescrypt this is guaranteed to be
+   * 80 bytes only. */
+  readbufsize = 80;
+
+  applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+ }
+ else if (!safe_cmp(algorithm->name, "lyra2REv2")) {
+  /* The scratch/pad buffer needs LYRA_SCRATCHBUF_SIZE bytes per thread. */
+  bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+  buf1size = 4 * 8 * cgpu->thread_concurrency; // matrix
+
+  /* This is the input buffer. For lyra2REv2 this is guaranteed to be
+   * 80 bytes only. */
+  readbufsize = 80;
+
+  applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size);
+ }
+ else if (!safe_cmp(algorithm->name, "pluck")) {
+  /* The scratch/pad buffer needs PLUCK_SCRATCHBUF_SIZE bytes per thread. */
+  bufsize = PLUCK_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+
+  /* This is the input buffer. For pluck this is guaranteed to be
+   * 80 bytes only. */
+  readbufsize = 80;
+
+  applog(LOG_DEBUG, "pluck buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+ }
+ else {
 size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
 bufsize = 128 * ipt * cgpu->thread_concurrency;
 applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
@@ -557,6 +865,10 @@
 }
 
 clState->padbuffer8 = NULL;
+ clState->buffer1 = NULL;
+ clState->buffer2 = NULL;
+ clState->buffer3 = NULL;
+
 if (bufsize > 0) {
 applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize);
@@ -568,6 +880,47 @@
 applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
 }
 
+ if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) {
+  // need additional buffers
+  clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
+  if (status != CL_SUCCESS && !clState->buffer1) {
+   applog(LOG_ERR, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
+   return NULL;
+  }
+
+  clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status);
+  if (status != CL_SUCCESS && !clState->buffer2) {
+   applog(LOG_ERR, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status);
+   return NULL;
+  }
+
+  clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status);
+  if (status != CL_SUCCESS && !clState->buffer3) {
+   applog(LOG_ERR, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status);
+   return NULL;
+  }
+ }
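As with padbuffer8 below, these checks only give up when the status is an error and the returned handle is NULL. For ordinary buffers the simpler convention is to trust the status alone, since clCreateBuffer returns NULL whenever it reports an error; a sketch of that stricter form (an alternative, not what the patch does):

#include <CL/cl.h>
#include <stddef.h>

/* Stricter variant: any non-CL_SUCCESS status is treated as fatal. */
static cl_mem create_rw_buffer(cl_context ctx, size_t size, cl_int *status)
{
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, status);

    return (*status == CL_SUCCESS) ? buf : NULL;
}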
+ else if (!safe_cmp(algorithm->name, "lyra2REv2")) {
+  // need additional buffers
+  clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
+  if (status != CL_SUCCESS && !clState->buffer1) {
+   applog(LOG_ERR, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
+   return NULL;
+  }
+ }
+ else {
+  // generic fallback: reuse bufsize even though buffer1 could be smaller
+  clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
+  if (status != CL_SUCCESS && !clState->buffer1) {
+   applog(LOG_ERR, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
+   return NULL;
+  }
+ }
+
 /* This buffer is weird and might work to some degree even if
  * the create buffer call has apparently failed, so check if we
  * get anything back before we call it a failure. */
@@ -576,6 +929,13 @@
 applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
 return NULL;
 }
+
 }
 
 applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize);
diff --git a/ocl.h b/ocl.h
index 272246da1..30b77eed3 100644
--- a/ocl.h
+++ b/ocl.h
@@ -22,7 +22,10 @@ typedef struct __clState {
 cl_mem outputBuffer;
 cl_mem CLbuffer0;
 cl_mem padbuffer8;
- unsigned char cldata[80];
+ cl_mem buffer1;
+ cl_mem buffer2;
+ cl_mem buffer3;
+ unsigned char cldata[168];
 bool hasBitAlign;
 bool goffset;
 cl_uint vwidth;
diff --git a/sgminer.c b/sgminer.c
index f3e409334..fe64a4142 100644
--- a/sgminer.c
+++ b/sgminer.c
@@ -48,6 +48,7 @@ char *curly = ":D";
 #endif
 #include
 #include "sph/sph_sha2.h"
+#include "sph/sph_blake.h"
 
 #include "compat.h"
 #include "miner.h"
@@ -1922,6 +1923,7 @@ static void calc_midstate(struct work *work)
 endian_flip32(work->midstate, work->midstate);
 }
 
+
 static struct work *make_work(void)
 {
 struct work *w = (struct work *)calloc(1, sizeof(struct work));
@@ -2263,7 +2265,12 @@ static bool gbt_decode(struct pool *pool, json_t *res_val)
 static bool getwork_decode(json_t *res_val, struct work *work)
 {
- if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) {
+ size_t worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : 128);
+
+ if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) {
 if (opt_morenotices)
 applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), ""));
 return false;
@@ -3021,10 +3028,18 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
 cgpu = get_thr_cgpu(thr_id);
 
- endian_flip128(work->data, work->data);
+ if (safe_cmp(work->pool->algorithm.name, "credits")) {
+  endian_flip128(work->data, work->data);
+ } else {
+  endian_flip168(work->data, work->data);
+ }
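After the flip, the submission hex string is built. Hex encoding doubles the byte count, so the three work sizes in play produce fixed-length submissions. A stand-in for sgminer's bin2hex() sizing, for reference only:

#include <stdio.h>
#include <stddef.h>

/* Minimal bin2hex() stand-in: 2 hex digits per byte plus a terminator,
 * giving 160 chars for 80-byte neoscrypt headers, 256 for the default
 * 128-byte buffer, and 336 for credits' 168-byte header. */
static void to_hex(char *dst, const unsigned char *src, size_t len)
{
    for (size_t i = 0; i < len; i++)
        sprintf(dst + 2 * i, "%02x", src[i]);
}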
 
 /* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */
- hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));
+ int worksize_default = 128;
+ hexstr = bin2hex(work->data, (!safe_cmp(work->pool->algorithm.name, "neoscrypt") ? 80 :
+                               (!safe_cmp(work->pool->algorithm.name, "credits") ? sizeof(work->data) : worksize_default)));
 
 /* build JSON-RPC request */
 if (work->gbt) {
@@ -7063,7 +7078,10 @@ void inc_hw_errors(struct thr_info *thr)
 /* Fills in the work nonce and builds the output data in work->hash */
 static void rebuild_nonce(struct work *work, uint32_t nonce)
 {
- uint32_t *work_nonce = (uint32_t *)(work->data + 76);
+ uint32_t nonce_pos = 76;
+
+ if (!safe_cmp(work->pool->algorithm.name, "credits"))
+  nonce_pos = 140;
+
+ uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos);
 
 *work_nonce = htole32(nonce);
@@ -7079,7 +7097,10 @@ bool test_nonce(struct work *work, uint32_t nonce)
 rebuild_nonce(work, nonce);
 
 // for Neoscrypt, the diff1targ value is in work->target
- if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+ if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") ||
+     !safe_cmp(work->pool->algorithm.name, "pluck") ||
+     !safe_cmp(work->pool->algorithm.name, "yescrypt") ||
+     !safe_cmp(work->pool->algorithm.name, "yescrypt-multi")) {
 diff1targ = ((uint32_t *)work->target)[7];
 }
 else {
@@ -8723,7 +8744,8 @@ int main(int argc, char *argv[])
 #endif
 
 /* Default algorithm specified in algorithm.c ATM */
- set_algorithm(&default_profile.algorithm, "scrypt");
+ /* Changed to x11, which does not crash at startup. */
+ set_algorithm(&default_profile.algorithm, "x11");
 
 devcursor = 8;
 logstart = devcursor + 1;
diff --git a/sgminer.exe b/sgminer.exe
new file mode 100644
index 000000000..17491d63b
Binary files /dev/null and b/sgminer.exe differ
diff --git a/sph/Makefile.am b/sph/Makefile.am
index d80e438ad..bc2f4b238 100644
--- a/sph/Makefile.am
+++ b/sph/Makefile.am
@@ -1,3 +1,3 @@
 noinst_LIBRARIES = libsph.a
 
-libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c
+libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c sha256_Y.c
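The new sph/sha256_Y.c below is Colin Percival's SHA-256/HMAC/PBKDF2 implementation from scrypt, carried with a _Y suffix (presumably to keep its symbols clear of sgminer's existing sph_sha2 code); the yescrypt host code added elsewhere in this patch drives it through the API declared in sha256_Y.h. A one-shot HMAC helper built on that API, as a usage sketch:

#include <stdint.h>
#include <stddef.h>
#include "sph/sha256_Y.h"

/* Sketch: single-call HMAC-SHA256 over one message. */
static void hmac_sha256_y(uint8_t mac[32], const void *key, size_t keylen,
                          const void *msg, size_t msglen)
{
    HMAC_SHA256_CTX_Y ctx;

    HMAC_SHA256_Init_Y(&ctx, key, keylen);
    HMAC_SHA256_Update_Y(&ctx, msg, msglen);
    HMAC_SHA256_Final_Y(mac, &ctx);
}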
diff --git a/sph/sha256_Y.c b/sph/sha256_Y.c
new file mode 100644
index 000000000..a5d786d3f
--- /dev/null
+++ b/sph/sha256_Y.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright 2005,2007,2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <stdint.h>
+#include <string.h>
+
+#include "algorithm/sysendian.h"
+
+#include "sph/sha256_Y.h"
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+  be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t). Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+  dst[i] = be32dec(src + i * 4);
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ t0 = h + S1(e) + Ch(e, f, g) + k; \
+ t1 = S0(a) + Maj(a, b, c); \
+ d += t0; \
+ h = t0 + t1;
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, k) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i] + k)
+
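RNDr never physically rotates the eight working variables; round i just re-labels them, reading the role of "a" from S[(64 - i) % 8], "b" from S[(65 - i) % 8], and so on, which is the same trick the hand-unrolled P(v7, v0, ...) calls perform in the kernel code earlier in this patch. The first two expansions make the pattern visible:

/* RNDr(S, W, 0, k) -> RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], W[0] + k)
 * RNDr(S, W, 1, k) -> RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], W[1] + k)
 * After 64 rounds the labels are back in their starting slots (64 % 8 == 0). */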
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t * state, const unsigned char block[64])
+{
+ uint32_t W[64];
+ uint32_t S[8];
+ uint32_t t0, t1;
+ int i;
+
+ /* 1. Prepare message schedule W. */
+ be32dec_vect(W, block, 64);
+ for (i = 16; i < 64; i++)
+  W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 32);
+
+ /* 3. Mix. */
+ RNDr(S, W, 0, 0x428a2f98);
+ RNDr(S, W, 1, 0x71374491);
+ RNDr(S, W, 2, 0xb5c0fbcf);
+ RNDr(S, W, 3, 0xe9b5dba5);
+ RNDr(S, W, 4, 0x3956c25b);
+ RNDr(S, W, 5, 0x59f111f1);
+ RNDr(S, W, 6, 0x923f82a4);
+ RNDr(S, W, 7, 0xab1c5ed5);
+ RNDr(S, W, 8, 0xd807aa98);
+ RNDr(S, W, 9, 0x12835b01);
+ RNDr(S, W, 10, 0x243185be);
+ RNDr(S, W, 11, 0x550c7dc3);
+ RNDr(S, W, 12, 0x72be5d74);
+ RNDr(S, W, 13, 0x80deb1fe);
+ RNDr(S, W, 14, 0x9bdc06a7);
+ RNDr(S, W, 15, 0xc19bf174);
+ RNDr(S, W, 16, 0xe49b69c1);
+ RNDr(S, W, 17, 0xefbe4786);
+ RNDr(S, W, 18, 0x0fc19dc6);
+ RNDr(S, W, 19, 0x240ca1cc);
+ RNDr(S, W, 20, 0x2de92c6f);
+ RNDr(S, W, 21, 0x4a7484aa);
+ RNDr(S, W, 22, 0x5cb0a9dc);
+ RNDr(S, W, 23, 0x76f988da);
+ RNDr(S, W, 24, 0x983e5152);
+ RNDr(S, W, 25, 0xa831c66d);
+ RNDr(S, W, 26, 0xb00327c8);
+ RNDr(S, W, 27, 0xbf597fc7);
+ RNDr(S, W, 28, 0xc6e00bf3);
+ RNDr(S, W, 29, 0xd5a79147);
+ RNDr(S, W, 30, 0x06ca6351);
+ RNDr(S, W, 31, 0x14292967);
+ RNDr(S, W, 32, 0x27b70a85);
+ RNDr(S, W, 33, 0x2e1b2138);
+ RNDr(S, W, 34, 0x4d2c6dfc);
+ RNDr(S, W, 35, 0x53380d13);
+ RNDr(S, W, 36, 0x650a7354);
+ RNDr(S, W, 37, 0x766a0abb);
+ RNDr(S, W, 38, 0x81c2c92e);
+ RNDr(S, W, 39, 0x92722c85);
+ RNDr(S, W, 40, 0xa2bfe8a1);
+ RNDr(S, W, 41, 0xa81a664b);
+ RNDr(S, W, 42, 0xc24b8b70);
+ RNDr(S, W, 43, 0xc76c51a3);
+ RNDr(S, W, 44, 0xd192e819);
+ RNDr(S, W, 45, 0xd6990624);
+ RNDr(S, W, 46, 0xf40e3585);
+ RNDr(S, W, 47, 0x106aa070);
+ RNDr(S, W, 48, 0x19a4c116);
+ RNDr(S, W, 49, 0x1e376c08);
+ RNDr(S, W, 50, 0x2748774c);
+ RNDr(S, W, 51, 0x34b0bcb5);
+ RNDr(S, W, 52, 0x391c0cb3);
+ RNDr(S, W, 53, 0x4ed8aa4a);
+ RNDr(S, W, 54, 0x5b9cca4f);
+ RNDr(S, W, 55, 0x682e6ff3);
+ RNDr(S, W, 56, 0x748f82ee);
+ RNDr(S, W, 57, 0x78a5636f);
+ RNDr(S, W, 58, 0x84c87814);
+ RNDr(S, W, 59, 0x8cc70208);
+ RNDr(S, W, 60, 0x90befffa);
+ RNDr(S, W, 61, 0xa4506ceb);
+ RNDr(S, W, 62, 0xbef9a3f7);
+ RNDr(S, W, 63, 0xc67178f2);
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++) {
+  state[i] += S[i];
+ }
+
+ /* Clean the stack. */
+ memset(W, 0, 256);
+ memset(S, 0, 32);
+ t0 = t1 = 0;
+}
+
+static unsigned char PAD[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX_Y * ctx)
+{
+ unsigned char len[8];
+ uint32_t r, plen;
+
+ /*
+  * Convert length to a vector of bytes -- we do this now rather
+  * than later because the length will change after we pad.
+  */
+ be32enc_vect(len, ctx->count, 8);
+
+ /* Add 1--64 bytes so that the resulting length is 56 mod 64 */
+ r = (ctx->count[1] >> 3) & 0x3f;
+ plen = (r < 56) ? (56 - r) : (120 - r);
+ SHA256_Update_Y(ctx, PAD, (size_t)plen);
+
+ /* Add the terminating bit-count */
+ SHA256_Update_Y(ctx, len, 8);
+}
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init_Y(SHA256_CTX_Y * ctx)
+{
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6A09E667;
+ ctx->state[1] = 0xBB67AE85;
+ ctx->state[2] = 0x3C6EF372;
+ ctx->state[3] = 0xA54FF53A;
+ ctx->state[4] = 0x510E527F;
+ ctx->state[5] = 0x9B05688C;
+ ctx->state[6] = 0x1F83D9AB;
+ ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+ uint32_t bitlen[2];
+ uint32_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count[1] >> 3) & 0x3f;
+
+ /* Convert the length into a number of bits */
+ bitlen[1] = ((uint32_t)len) << 3;
+ bitlen[0] = (uint32_t)(len >> 29);
+
+ /* Update number of bits */
+ if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+  ctx->count[0]++;
+ ctx->count[0] += bitlen[0];
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+  memcpy(&ctx->buf[r], src, len);
+  return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ while (len >= 64) {
+  SHA256_Transform(ctx->state, src);
+  src += 64;
+  len -= 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
+{
+ /* Add padding */
+ SHA256_Pad(ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, 32);
+
+ /* Clear the context state */
+ memset((void *)ctx, 0, sizeof(*ctx));
+}
+
+/* Initialize an HMAC-SHA256 operation with the given key. */
+void
+HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
+{
+ unsigned char pad[64];
+ unsigned char khash[32];
+ const unsigned char * K = _K;
+ size_t i;
+
+ /* If Klen > 64, the key is really SHA256(K). */
+ if (Klen > 64) {
+  SHA256_Init_Y(&ctx->ictx);
+  SHA256_Update_Y(&ctx->ictx, K, Klen);
+  SHA256_Final_Y(khash, &ctx->ictx);
+  K = khash;
+  Klen = 32;
+ }
+
+ /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
+ SHA256_Init_Y(&ctx->ictx);
+ memset(pad, 0x36, 64);
+ for (i = 0; i < Klen; i++) {
+  pad[i] ^= K[i];
+ }
+ SHA256_Update_Y(&ctx->ictx, pad, 64);
+
+ /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
+ SHA256_Init_Y(&ctx->octx);
+ memset(pad, 0x5c, 64);
+ for (i = 0; i < Klen; i++) {
+  pad[i] ^= K[i];
+ }
+ SHA256_Update_Y(&ctx->octx, pad, 64);
+
+ /* Clean the stack. */
+ memset(khash, 0, 32);
+}
+
+/* Add bytes to the HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+ /* Feed data to the inner SHA256 operation. */
+ SHA256_Update_Y(&ctx->ictx, in, len);
+}
+
+/* Finish an HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
+{
+ unsigned char ihash[32];
+
+ /* Finish the inner SHA256 operation. */
+ SHA256_Final_Y(ihash, &ctx->ictx);
+
+ /* Feed the inner hash to the outer SHA256 operation. */
+ SHA256_Update_Y(&ctx->octx, ihash, 32);
+
+ /* Finish the outer SHA256 operation. */
+ SHA256_Final_Y(digest, &ctx->octx);
+
+ /* Clean the stack. */
+ memset(ihash, 0, 32);
+}
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void
+PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
+{
+ HMAC_SHA256_CTX_Y PShctx, hctx;
+ size_t i;
+ uint8_t ivec[4];
+ uint8_t U[32];
+ uint8_t T[32];
+ uint64_t j;
+ int k;
+ size_t clen;
+
+ /* Compute HMAC state after processing P and S. */
+ HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
+ HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
+
+ /* Iterate through the blocks. */
+ for (i = 0; i * 32 < dkLen; i++) {
+  /* Generate INT(i + 1). */
+  be32enc(ivec, (uint32_t)(i + 1));
+
+  /* Compute U_1 = PRF(P, S || INT(i)). */
+  memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
+  HMAC_SHA256_Update_Y(&hctx, ivec, 4);
+  HMAC_SHA256_Final_Y(U, &hctx);
+
+  /* T_i = U_1 ... */
+  memcpy(T, U, 32);
+
+  for (j = 2; j <= c; j++) {
+   /* Compute U_j. */
+   HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
+   HMAC_SHA256_Update_Y(&hctx, U, 32);
+   HMAC_SHA256_Final_Y(U, &hctx);
+
+   /* ... xor U_j ... */
+   for (k = 0; k < 32; k++)
+    T[k] ^= U[k];
+  }
+
+  /* Copy as many bytes as necessary into buf. */
+  clen = dkLen - i * 32;
+  if (clen > 32)
+   clen = 32;
+  memcpy(&buf[i * 32], T, clen);
+ }
+
+ /* Clean PShctx, since we never called _Final on it. */
+ memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
+}
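A usage sketch for the function above, in the shape of yescrypt's first derivation pass, where the input serves as both password and salt with a single iteration (the exact lengths and iteration count here are illustrative, not lifted from the yescrypt code in this patch):

#include <stdint.h>
#include "sph/sha256_Y.h"

static void derive_example(const uint8_t header[80], uint8_t out[64])
{
    /* PBKDF2(passwd = header, salt = header, c = 1, dkLen = 64) */
    PBKDF2_SHA256(header, 80, header, 80, 1, out, 64);
}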
diff --git a/sph/sha256_Y.h b/sph/sha256_Y.h
new file mode 100644
index 000000000..e97b81ba2
--- /dev/null
+++ b/sph/sha256_Y.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright 2005,2007,2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#include <sys/types.h>
+
+#include <stdint.h>
+
+typedef struct SHA256Context {
+ uint32_t state[8];
+ uint32_t count[2];
+ unsigned char buf[64];
+} SHA256_CTX_Y;
+
+typedef struct HMAC_SHA256Context {
+ SHA256_CTX_Y ictx;
+ SHA256_CTX_Y octx;
+} HMAC_SHA256_CTX_Y;
+
+void SHA256_Init_Y(SHA256_CTX_Y *);
+void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
+void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
+void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
+    uint64_t, uint8_t *, size_t);
+
+#endif /* !_SHA256_H_ */
diff --git a/util.c b/util.c
index ae62be2df..756ff7f05 100644
--- a/util.c
+++ b/util.c
@@ -674,7 +674,7 @@ bool fulltest(const unsigned char *hash, const unsigned char *target)
 uint32_t *target32 = (uint32_t *)target;
 bool rc = true;
 int i;
-
+
 for (i = 28 / 4; i >= 0; i--) {
 uint32_t h32tmp = le32toh(hash32[i]);
 uint32_t t32tmp = le32toh(target32[i]);
@@ -1560,6 +1560,8 @@ static bool parse_notify(struct pool *pool, json_t *val)
 pool->swork.nbit = nbit;
 pool->swork.ntime = ntime;
 pool->swork.clean = clean;
+ pool->swork.diff = pool->swork.next_diff;
+
 alloc_len = pool->swork.cb_len = cb1_len + pool->n1_len + pool->n2size + cb2_len;
 pool->nonce2_offset = cb1_len + pool->n1_len;
@@ -1668,8 +1670,8 @@ static bool parse_diff(struct pool *pool, json_t *val)
 return false;
 
 cg_wlock(&pool->data_lock);
- old_diff = pool->swork.diff;
- pool->swork.diff = diff;
+ old_diff = pool->swork.next_diff;
+ pool->swork.next_diff = diff;
 cg_wunlock(&pool->data_lock);
 
 if (old_diff != diff) {
@@ -2560,7 +2562,7 @@ bool initiate_stratum(struct pool *pool)
 if (!pool->stratum_url)
 pool->stratum_url = pool->sockaddr_url;
 pool->stratum_active = true;
- pool->swork.diff = 1;
+ pool->swork.next_diff = pool->swork.diff = 1;
 
 if (opt_protocol) {
 applog(LOG_DEBUG, "%s confirmed mining.subscribe with extranonce1 %s extran2size %d",
 get_pool_name(pool), pool->nonce1, pool->n2size);
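Taken together, the util.c hunks implement a two-step difficulty handoff around the new swork.next_diff field: mining.set_difficulty only records the value, and it becomes the live share difficulty when the next mining.notify (a new job) arrives, so a difficulty change is never applied retroactively to work generated under the old one. The moving parts, reduced to a sketch:

/* Sketch of the deferred-difficulty flow added above. */
struct swork_diff {
    double diff;       /* applied to shares for the current job */
    double next_diff;  /* announced, takes effect on the next job */
};

static void on_set_difficulty(struct swork_diff *s, double d)
{
    s->next_diff = d;            /* parse_diff() */
}

static void on_notify(struct swork_diff *s)
{
    s->diff = s->next_diff;      /* parse_notify() */
}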