From dc3f0b86fc1528ed99a3fb34e05b5c4ddde04230 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 17:44:43 -0500 Subject: [PATCH 1/7] (Not Working) Add K12 package --- crypto/build.rs | 12 +- ffi-deps/FourQlib/FourQ_32bit/schnorrq.c | 2 +- ffi-deps/K12/README.markdown | 84 + ffi-deps/K12/lib/KangarooTwelve.c | 333 ++++ ffi-deps/K12/lib/KangarooTwelve.h | 134 ++ ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h | 48 + .../K12/lib/Plain64/KeccakP-1600-plain64.c | 24 + ffi-deps/K12/lib/align.h | 34 + ffi-deps/K12/lib/brg_endian.h | 143 ++ ffi-deps/chopper-linux.cpp | 1473 +---------------- identity/build.rs | 8 +- 11 files changed, 838 insertions(+), 1457 deletions(-) create mode 100644 ffi-deps/K12/README.markdown create mode 100644 ffi-deps/K12/lib/KangarooTwelve.c create mode 100644 ffi-deps/K12/lib/KangarooTwelve.h create mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c create mode 100644 ffi-deps/K12/lib/align.h create mode 100644 ffi-deps/K12/lib/brg_endian.h diff --git a/crypto/build.rs b/crypto/build.rs index 91ddaf0..8d1f280 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -18,7 +18,7 @@ fn main() { } else { cc::Build::new() .define("__LINUX__", "1") - .define("_ARM_", "1") + .define("_X86_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -31,9 +31,17 @@ fn main() { .file("../ffi-deps/FourQlib/sha512/sha512.c") .compile("libFourQ"); + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Plain64") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") - .define("_AMD64_", "1") + .define("__LINUX__", "1") + .define("_X86_", "1") + //.define("_AMD64_", "1") .compile("Chopper") } } diff --git a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c b/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c index e1f130f..9905350 100644 --- 
a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c +++ b/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c @@ -17,7 +17,7 @@ #include #include -extern int KangarooTwelveCryptoHashFunction(unsigned char* input, unsigned int inputByteLen, unsigned char* output); +extern int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output); #define CryptoHashFunction KangarooTwelveCryptoHashFunction ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) diff --git a/ffi-deps/K12/README.markdown b/ffi-deps/K12/README.markdown new file mode 100644 index 0000000..4a85e1b --- /dev/null +++ b/ffi-deps/K12/README.markdown @@ -0,0 +1,84 @@ +# What is KangarooTwelve ? + +[**KangarooTwelve**][k12] (or **K12**) is a fast and secure extendable-output function (XOF), the generalization of hash functions to arbitrary output lengths. +Derived from Keccak, it aims at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security. + +On high-end platforms, it can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors. +On Intel's Haswell and Skylake architectures, KangarooTwelve tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.51 cycles/byte on the SkylakeX and Cascade Lake architectures. +On the latest Apple A14 and M1 processors, KangarooTwelve can take advantage of the ARMv8-A's SHA-3 dedicated instructions to deliver 0.75 cycles/byte for long messages on a single core. +On low-end platforms, as well as for short messages, it also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128. + +More details can be found in our [ACNS Paper][eprint]. + +# What can I find here? 
+ +This repository contains source code that implements the extandable output (or hash) function [**KangarooTwelve**][k12] (or **K12**). +Its purpose is to offer optimized implementations of K12 and nothing else. + +The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for K12. +It is still structured like the XKCP in two layers. The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the K12 tree hash mode. +Also, some sources have been merged to reduce the file count. + +* For the higher layer, we kept only the code needed for K12. +* For the lower layer, we removed all the functions that are not needed for K12. The lower layer therefore implements a subset of the SnP and PlSnP interfaces. + +For Keccak or Xoodoo-based functions other than K12 only, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP. + + +# Is there a tool to compute the K12 hash of a file? + +Not in this repository, but Jack O'Connor's [`kangarootwelve_xkcp.rs` repository](https://github.com/oconnor663/kangarootwelve_xkcp.rs) contains Rust bindings to this code and a `k12sum` utility. +Pre-built binaries can be found [there](https://github.com/oconnor663/kangarootwelve_xkcp.rs/releases). + + +# How can I build this K12 code? + +This repository uses the same build system as that of the XKCP. +To build, the following tools are needed: + +* *GCC* +* *GNU make* +* *xsltproc* + +The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g., + +``` +make generic64/K12Tests +``` + +to build K12Tests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. 
As another example, the static (resp. dynamic) library is built by typing `make generic64/libK12.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name. An alternate C compiler can be specified via the `CC` environment variable. + +Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g., + +``` +make generic64/K12Tests.pack +``` + +This creates a `.tar.gz` archive with all the necessary files to build the given target. + +The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters. + +## Microsoft Visual Studio support + +KangarooTwelve can be compiled with Microsoft Visual Studio (MSVC). The XKCP build system offers support for the creation of project files. To get a project file for a given target, simply append `.vcxproj` to the target name, e.g., + +``` +make generic64noAsm/K12Tests.vcxproj +``` + +The targets `generic32` and `generic64noAsm` can be used with MSVC, but not `generic64` as it contains assembly implementations in the GCC syntax, which at this point cannot be used with MSVC. +Please refer to the documention of [XKCP][xkcp] for more details on the limitations of the support of MSVC. 
+ +[k12]: https://keccak.team/kangarootwelve.html +[xkcp]: https://github.com/XKCP/XKCP +[eprint]: https://eprint.iacr.org/2016/770.pdf + + +# Acknowledgments + +We wish to thank: + +- Andy Polyakov for his expertise with the ARMv8-A+SHA3 code, and in particular for his core routine from [CRYPTOGAMS](https://github.com/dot-asm/cryptogams) +- Duc Tri Nguyen for his benchmark on the Apple M1 +- Jack O'Connor for bug fixes and more importantly for his [Rust bindings](https://github.com/oconnor663/kangarootwelve_xkcp.rs) +- Kent Ross for his contributions to this code and its quality diff --git a/ffi-deps/K12/lib/KangarooTwelve.c b/ffi-deps/K12/lib/KangarooTwelve.c new file mode 100644 index 0000000..ad184b1 --- /dev/null +++ b/ffi-deps/K12/lib/KangarooTwelve.c @@ -0,0 +1,333 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include +#include "KangarooTwelve.h" +#include "KeccakP-1600-SnP.h" + +/* ---------------------------------------------------------------- */ + +#define K12_security 128 +#define K12_capacity (2*K12_security) +#define K12_capacityInBytes (K12_capacity/8) +#define K12_rate (1600-K12_capacity) +#define K12_rateInBytes (K12_rate/8) +#define K12_rateInLanes (K12_rate/64) + +static void TurboSHAKE128_Initialize(TurboSHAKE128_Instance *instance) +{ + KeccakP1600_Initialize(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 0; +} + +static void TurboSHAKE128_Absorb(TurboSHAKE128_Instance *instance, const unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + uint8_t partialBlock; + const unsigned char *curData; + const uint8_t rateInBytes = K12_rateInBytes; + + assert(instance->squeezing == 0); + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) { +#ifdef KeccakP1600_12rounds_FastLoop_supported + /* processing full blocks first */ + j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, K12_rateInLanes, curData, dataByteLen - i); + i += j; + curData += j; +#endif + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes); + KeccakP1600_Permute_12rounds(instance->state); + curData+=rateInBytes; + } + i = dataByteLen - j; + } else { + /* normal lane: using the message queue */ + if (dataByteLen - i > (size_t)rateInBytes - instance->byteIOIndex) { + partialBlock = rateInBytes-instance->byteIOIndex; + } else { + partialBlock = (uint8_t)(dataByteLen - i); + } + i += partialBlock; + + KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + curData += partialBlock; + instance->byteIOIndex += partialBlock; + if (instance->byteIOIndex == rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + 
instance->byteIOIndex = 0; + } + } + } +} + +static void TurboSHAKE128_AbsorbDomainSeparationByte(TurboSHAKE128_Instance *instance, unsigned char D) +{ + const unsigned int rateInBytes = K12_rateInBytes; + + assert(D != 0); + assert(instance->squeezing == 0); + + /* Last few bits, whose delimiter coincides with first bit of padding */ + KeccakP1600_AddByte(instance->state, D, instance->byteIOIndex); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((D >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) + KeccakP1600_Permute_12rounds(instance->state); + /* Second bit of padding */ + KeccakP1600_AddByte(instance->state, 0x80, rateInBytes-1); + KeccakP1600_Permute_12rounds(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 1; +} + +static void TurboSHAKE128_Squeeze(TurboSHAKE128_Instance *instance, unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + const unsigned int rateInBytes = K12_rateInBytes; + unsigned char *curData; + + if (!instance->squeezing) + TurboSHAKE128_AbsorbDomainSeparationByte(instance, 0x01); + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) { + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes); + curData+=rateInBytes; + } + i = dataByteLen - j; + } else { + /* normal lane: using the message queue */ + if (instance->byteIOIndex == rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + instance->byteIOIndex = 0; + } + if (dataByteLen-i > rateInBytes-instance->byteIOIndex) + partialBlock = rateInBytes-instance->byteIOIndex; + else + partialBlock = (unsigned int)(dataByteLen - i); + i += partialBlock; + + KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + curData += 
partialBlock; + instance->byteIOIndex += partialBlock; + } + } +} + +/* ---------------------------------------------------------------- */ + +typedef enum { + NOT_INITIALIZED, + ABSORBING, + FINAL, + SQUEEZING +} KCP_Phases; +typedef KCP_Phases KangarooTwelve_Phases; + +#define K12_chunkSize 8192 +#define K12_suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ + +#ifndef KeccakP1600_disableParallelism + +void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output); + +#define ProcessLeaves( Parallellism ) \ + while (inputByteLen >= Parallellism * K12_chunkSize) { \ + unsigned char intermediate[Parallellism*K12_capacityInBytes]; \ + \ + KangarooTwelve_Process##Parallellism##Leaves(input, intermediate); \ + input += Parallellism * K12_chunkSize; \ + inputByteLen -= Parallellism * K12_chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, Parallellism * K12_capacityInBytes); \ + } + +#endif // KeccakP1600_disableParallelism + +static unsigned int right_encode(unsigned char * encbuf, size_t value) +{ + unsigned int n, i; + size_t v; + + for (v = value, n = 0; v && (n < sizeof(size_t)); ++n, v >>= 8) + ; /* empty */ + for (i = 1; i <= n; ++i) { + encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); + } + encbuf[n] = (unsigned char)n; + return n + 1; +} + +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen) +{ + ktInstance->fixedOutputLength = outputByteLen; + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 0; + ktInstance->phase = ABSORBING; + TurboSHAKE128_Initialize(&ktInstance->finalNode); + return 0; +} + +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen) +{ + if 
(ktInstance->phase != ABSORBING) + return 1; + + if (ktInstance->blockNumber == 0) { + /* First block, absorb in final node */ + unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? (unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); + TurboSHAKE128_Absorb(&ktInstance->finalNode, input, len); + input += len; + inputByteLen -= len; + ktInstance->queueAbsorbedLen += len; + if ((ktInstance->queueAbsorbedLen == K12_chunkSize) && (inputByteLen != 0)) { + /* First block complete and more input data available, finalize it */ + const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 1; + TurboSHAKE128_Absorb(&ktInstance->finalNode, &padding, 1); + ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ + } + } else if (ktInstance->queueAbsorbedLen != 0) { + /* There is data in the queue, absorb further in queue until block complete */ + unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? 
(unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); + TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); + input += len; + inputByteLen -= len; + ktInstance->queueAbsorbedLen += len; + if (ktInstance->queueAbsorbedLen == K12_chunkSize) { + unsigned char intermediate[K12_capacityInBytes]; + ktInstance->queueAbsorbedLen = 0; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } + } + +#ifndef KeccakP1600_disableParallelism + if (KeccakP1600times8_IsAvailable()) { + ProcessLeaves(8); + } + + if (KeccakP1600times4_IsAvailable()) { + ProcessLeaves(4); + } + + if (KeccakP1600times2_IsAvailable()) { + ProcessLeaves(2); + } +#endif + + while (inputByteLen > 0) { + unsigned int len = (inputByteLen < K12_chunkSize) ? (unsigned int)inputByteLen : K12_chunkSize; + TurboSHAKE128_Initialize(&ktInstance->queueNode); + TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); + input += len; + inputByteLen -= len; + if (len == K12_chunkSize) { + unsigned char intermediate[K12_capacityInBytes]; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } else { + ktInstance->queueAbsorbedLen = len; + } + } + + return 0; +} + +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen) +{ + unsigned char encbuf[sizeof(size_t)+1+2]; + unsigned char padding; + + if (ktInstance->phase != ABSORBING) + return 1; + + /* Absorb customization | right_encode(customByteLen) */ + if ((customByteLen != 0) && 
(KangarooTwelve_Update(ktInstance, customization, customByteLen) != 0)) + return 1; + if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customByteLen)) != 0) + return 1; + + if (ktInstance->blockNumber == 0) { + /* Non complete first block in final node, pad it */ + padding = 0x07; /* '11': message hop, final node */ + } else { + unsigned int n; + + if (ktInstance->queueAbsorbedLen != 0) { + /* There is data in the queue node */ + unsigned char intermediate[K12_capacityInBytes]; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } + --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ + n = right_encode(encbuf, ktInstance->blockNumber); + encbuf[n++] = 0xFF; + encbuf[n++] = 0xFF; + TurboSHAKE128_Absorb(&ktInstance->finalNode, encbuf, n); + padding = 0x06; /* '01': chaining hop, final node */ + } + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->finalNode, padding); + if (ktInstance->fixedOutputLength != 0) { + ktInstance->phase = FINAL; + TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); + return 0; + } + ktInstance->phase = SQUEEZING; + return 0; +} + +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen) +{ + if (ktInstance->phase != SQUEEZING) + return 1; + TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, outputByteLen); + return 0; +} + +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, + unsigned char *output, size_t outputByteLen, + const unsigned char *customization, size_t customByteLen) +{ + KangarooTwelve_Instance ktInstance; + + if (outputByteLen == 0) + return 1; + KangarooTwelve_Initialize(&ktInstance, outputByteLen); + if 
(KangarooTwelve_Update(&ktInstance, input, inputByteLen) != 0) + return 1; + return KangarooTwelve_Final(&ktInstance, output, customization, customByteLen); +} diff --git a/ffi-deps/K12/lib/KangarooTwelve.h b/ffi-deps/K12/lib/KangarooTwelve.h new file mode 100644 index 0000000..f7b3e33 --- /dev/null +++ b/ffi-deps/K12/lib/KangarooTwelve.h @@ -0,0 +1,134 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KangarooTwelve_h_ +#define _KangarooTwelve_h_ + +#include +#include +#include "align.h" +#include "KeccakP-1600-SnP.h" + +typedef struct TurboSHAKE128_InstanceStruct { + uint8_t state[KeccakP1600_stateSizeInBytes]; + uint8_t byteIOIndex; + uint8_t squeezing; +} TurboSHAKE128_Instance; + +typedef struct KangarooTwelve_InstanceStruct { + ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance queueNode; + ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance finalNode; + size_t fixedOutputLength; + size_t blockNumber; + unsigned int queueAbsorbedLen; + int phase; +} KangarooTwelve_Instance; + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). 
+ * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + +/** + * Function to initialize a KangarooTwelve instance. + * @param ktInstance Pointer to the instance to be initialized. + * @param outputByteLen The desired number of output bytes, + * or 0 for an arbitrarily-long output. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); + +/** + * Function to give input data to be absorbed. + * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). + * @param input Pointer to the input message data (M). + * @param inputByteLen The number of bytes provided in the input message data. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); + +/** + * Function to call after all the input message has been input, and to get + * output bytes if the length was specified when calling KangarooTwelve_Initialize(). + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of + * output bytes is equal to @a outputByteLen. + * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes + * must be extracted using the KangarooTwelve_Squeeze() function. + * @param output Pointer to the buffer where to store the output data. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); + +/** + * Function to squeeze output data. + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * @param data Pointer to the buffer where to store the output data. + * @param outputByteLen The number of output bytes desired. + * @pre KangarooTwelve_Final() must have been already called. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); + +#if !defined(KeccakP1600_disableParallelism) && defined(KeccakP1600_enable_simd_options) +/** + * Functions to selectively disable the use of CPU features. Should be rarely + * needed; if you're not sure this is what you want, don't worry about it. + * + * /!\ WARNING /!\: Calling these functions REQUIRES that there are no + * KangarooTwelve instances in use. The effects are global and affect the code + * paths taken by every call, as well as the details of the represented states. + * Calling these functions in the middle of your program (as opposed to during + * setup) is PROBABLY WRONG. + * + * These functions are at present only used to increase test suite coverage, + * and demonstrate comparative performance between implementations in different + * instruction sets. To enable them, the macro KeccakP1600_enable_simd_options + * must be defined at compile time. + * + * They can potentially also be useful in an environment where it is + * detrimental to online large vector units on the CPU, since doing so can lead + * to downclocking, performance hits in other threads sharing the same CPU + * core, and short delays while the CPU's power license is increased to online + * the vector unit. 
+ * + * In the majority of situations, however, this should rarely matter and it is + * usually the case that the performance difference will be a wash or even an + * overall improvement despite the downsides. + * + * @return 1 if the feature was enabled and available and has been turned off, + * 0 if it was already disabled or unavailable. + */ +int KangarooTwelve_DisableAVX512(void); +int KangarooTwelve_DisableAVX2(void); +int KangarooTwelve_DisableSSSE3(void); + +/** + * Function to reset all CPU features to enabled-if-available. Calling this + * always has no effect if no CPU features have been explicitly disabled. + */ +void KangarooTwelve_EnableAllCpuFeatures(void); +#endif // !KeccakP1600_disableParallelism && KeccakP1600_enable_simd_options + +#endif diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h new file mode 100644 index 0000000..d9e0c6e --- /dev/null +++ b/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h @@ -0,0 +1,48 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported +#define KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +// Instead of defining proxy functions which do nothing, simply rename the +// symbols of the opt64 implementation where they are used. +#define KeccakP1600_opt64_Initialize KeccakP1600_Initialize +#define KeccakP1600_opt64_AddByte KeccakP1600_AddByte +#define KeccakP1600_opt64_AddBytes KeccakP1600_AddBytes +#define KeccakP1600_opt64_Permute_12rounds KeccakP1600_Permute_12rounds +#define KeccakP1600_opt64_ExtractBytes KeccakP1600_ExtractBytes +#define KeccakP1600_opt64_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb + +#endif diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c b/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c new file mode 100644 index 0000000..0043b4f --- /dev/null +++ b/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c @@ -0,0 +1,24 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +const char * KeccakP1600_GetImplementation() +{ + return "generic 64-bit implementation"; +} diff --git a/ffi-deps/K12/lib/align.h b/ffi-deps/K12/lib/align.h new file mode 100644 index 0000000..31586bb --- /dev/null +++ b/ffi-deps/K12/lib/align.h @@ -0,0 +1,34 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _align_h_ +#define _align_h_ + +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif + +#endif diff --git a/ffi-deps/K12/lib/brg_endian.h b/ffi-deps/K12/lib/brg_endian.h new file mode 100644 index 0000000..7c640b9 --- /dev/null +++ b/ffi-deps/K12/lib/brg_endian.h @@ -0,0 +1,143 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. 
+ + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define 
PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( 
__OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ + defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/ffi-deps/chopper-linux.cpp b/ffi-deps/chopper-linux.cpp index 07ea48f..193b1ef 100644 --- a/ffi-deps/chopper-linux.cpp +++ b/ffi-deps/chopper-linux.cpp @@ -61,51 +61,6 @@ long long unsigned int __shiftright128( #endif -//From Qiner - - -#if AVX512 -const __m512i zero = _mm512_maskz_set1_epi64(0, 0); -const __m512i moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); -const __m512i moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); -const __m512i rhoB = _mm512_setr_epi64(0, 1, 62, 28, 27, 0, 0, 0); -const __m512i rhoG = _mm512_setr_epi64(36, 44, 6, 55, 20, 0, 0, 0); -const __m512i rhoK = _mm512_setr_epi64(3, 10, 43, 25, 39, 0, 0, 0); -const __m512i rhoM = _mm512_setr_epi64(41, 45, 15, 
21, 8, 0, 0, 0); -const __m512i rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); -const __m512i pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); -const __m512i pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); -const __m512i pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); -const __m512i pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); -const __m512i pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); -const __m512i pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 8, 10); -const __m512i pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 9, 11); -const __m512i pi2BG = _mm512_setr_epi64(0, 1, 8, 9, 6, 5, 6, 7); -const __m512i pi2KM = _mm512_setr_epi64(2, 3, 10, 11, 7, 5, 6, 7); -const __m512i pi2S3 = _mm512_setr_epi64(4, 5, 12, 13, 4, 5, 6, 7); -const __m512i padding = _mm512_maskz_set1_epi64(1, 0x8000000000000000); - -const __m512i K12RoundConst0 = _mm512_maskz_set1_epi64(1, 0x000000008000808bULL); -const __m512i K12RoundConst1 = _mm512_maskz_set1_epi64(1, 0x800000000000008bULL); -const __m512i K12RoundConst2 = _mm512_maskz_set1_epi64(1, 0x8000000000008089ULL); -const __m512i K12RoundConst3 = _mm512_maskz_set1_epi64(1, 0x8000000000008003ULL); -const __m512i K12RoundConst4 = _mm512_maskz_set1_epi64(1, 0x8000000000008002ULL); -const __m512i K12RoundConst5 = _mm512_maskz_set1_epi64(1, 0x8000000000000080ULL); -const __m512i K12RoundConst6 = _mm512_maskz_set1_epi64(1, 0x000000000000800aULL); -const __m512i K12RoundConst7 = _mm512_maskz_set1_epi64(1, 0x800000008000000aULL); -const __m512i K12RoundConst8 = _mm512_maskz_set1_epi64(1, 0x8000000080008081ULL); -const __m512i K12RoundConst9 = _mm512_maskz_set1_epi64(1, 0x8000000000008080ULL); -const __m512i K12RoundConst10 = _mm512_maskz_set1_epi64(1, 0x0000000080000001ULL); -const __m512i K12RoundConst11 = _mm512_maskz_set1_epi64(1, 0x8000000080008008ULL); - -#endif - - - - - - -//End From Qiner #include #include @@ -133,448 +88,9 @@ extern "C" { #define CopyMemory(x, y, z) memcpy(x, y, z) - #define KeccakF1600RoundConstant0 
0x000000008000808bULL - #define KeccakF1600RoundConstant1 0x800000000000008bULL - #define KeccakF1600RoundConstant2 0x8000000000008089ULL - #define KeccakF1600RoundConstant3 0x8000000000008003ULL - #define KeccakF1600RoundConstant4 0x8000000000008002ULL - #define KeccakF1600RoundConstant5 0x8000000000000080ULL - #define KeccakF1600RoundConstant6 0x000000000000800aULL - #define KeccakF1600RoundConstant7 0x800000008000000aULL - #define KeccakF1600RoundConstant8 0x8000000080008081ULL - #define KeccakF1600RoundConstant9 0x8000000000008080ULL - #define KeccakF1600RoundConstant10 0x0000000080000001ULL - - #define declareABCDE \ - unsigned long long Aba, Abe, Abi, Abo, Abu; \ - unsigned long long Aga, Age, Agi, Ago, Agu; \ - unsigned long long Aka, Ake, Aki, Ako, Aku; \ - unsigned long long Ama, Ame, Ami, Amo, Amu; \ - unsigned long long Asa, Ase, Asi, Aso, Asu; \ - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; \ - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; \ - unsigned long long Bka, Bke, Bki, Bko, Bku; \ - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; \ - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; \ - unsigned long long Ca, Ce, Ci, Co, Cu; \ - unsigned long long Da, De, Di, Do, Du; \ - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; \ - unsigned long long Ega, Ege, Egi, Ego, Egu; \ - unsigned long long Eka, Eke, Eki, Eko, Eku; \ - unsigned long long Ema, Eme, Emi, Emo, Emu; \ - unsigned long long Esa, Ese, Esi, Eso, Esu; \ - - #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstant##i; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^((~Bbo)& 
Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^((~Bba)& Bbe ); \ - Cu = E##bu; \ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - Ci ^= E##gi; \ - E##go = Bgo ^((~Bgu)& Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^((~Bga)& Bge ); \ - Cu ^= E##gu; \ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^((~Bki)& Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = Bko ^((~Bku)& Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^((~Bka)& Bke ); \ - Cu ^= E##ku; \ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^((~Bmi)& Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - Ci ^= E##mi; \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^((~Bma)& Bme ); \ - Cu ^= E##mu; \ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = Bse ^((~Bsi)& Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^((~Bso)& Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^((~Bsu)& Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^((~Bsa)& Bse ); 
\ - Cu ^= E##su; - - #define copyFromState(state) \ - Aba = state[ 0]; \ - Abe = state[ 1]; \ - Abi = state[ 2]; \ - Abo = state[ 3]; \ - Abu = state[ 4]; \ - Aga = state[ 5]; \ - Age = state[ 6]; \ - Agi = state[ 7]; \ - Ago = state[ 8]; \ - Agu = state[ 9]; \ - Aka = state[10]; \ - Ake = state[11]; \ - Aki = state[12]; \ - Ako = state[13]; \ - Aku = state[14]; \ - Ama = state[15]; \ - Ame = state[16]; \ - Ami = state[17]; \ - Amo = state[18]; \ - Amu = state[19]; \ - Asa = state[20]; \ - Ase = state[21]; \ - Asi = state[22]; \ - Aso = state[23]; \ - Asu = state[24]; - - #define copyToState(state) \ - state[ 0] = Aba; \ - state[ 1] = Abe; \ - state[ 2] = Abi; \ - state[ 3] = Abo; \ - state[ 4] = Abu; \ - state[ 5] = Aga; \ - state[ 6] = Age; \ - state[ 7] = Agi; \ - state[ 8] = Ago; \ - state[ 9] = Agu; \ - state[10] = Aka; \ - state[11] = Ake; \ - state[12] = Aki; \ - state[13] = Ako; \ - state[14] = Aku; \ - state[15] = Ama; \ - state[16] = Ame; \ - state[17] = Ami; \ - state[18] = Amo; \ - state[19] = Amu; \ - state[20] = Asa; \ - state[21] = Ase; \ - state[22] = Asi; \ - state[23] = Aso; \ - state[24] = Asu; - - #define rounds12 \ - Ca = Aba^Aga^Aka^Ama^Asa; \ - Ce = Abe^Age^Ake^Ame^Ase; \ - Ci = Abi^Agi^Aki^Ami^Asi; \ - Co = Abo^Ago^Ako^Amo^Aso; \ - Cu = Abu^Agu^Aku^Amu^Asu; \ - thetaRhoPiChiIotaPrepareTheta(0, A, E) \ - thetaRhoPiChiIotaPrepareTheta(1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - Eba ^= Da; \ - Bba = Eba; \ - Ege ^= De; \ - Bbe = ROL64(Ege, 44); \ - Eki ^= Di; \ - Bbi 
= ROL64(Eki, 43); \ - Emo ^= Do; \ - Bbo = ROL64(Emo, 21); \ - Esu ^= Du; \ - Bbu = ROL64(Esu, 14); \ - Aba = Bba ^((~Bbe)& Bbi ); \ - Aba ^= 0x8000000080008008ULL; \ - Abe = Bbe ^((~Bbi)& Bbo ); \ - Abi = Bbi ^((~Bbo)& Bbu ); \ - Abo = Bbo ^((~Bbu)& Bba ); \ - Abu = Bbu ^((~Bba)& Bbe ); \ - Ebo ^= Do; \ - Bga = ROL64(Ebo, 28); \ - Egu ^= Du; \ - Bge = ROL64(Egu, 20); \ - Eka ^= Da; \ - Bgi = ROL64(Eka, 3); \ - Eme ^= De; \ - Bgo = ROL64(Eme, 45); \ - Esi ^= Di; \ - Bgu = ROL64(Esi, 61); \ - Aga = Bga ^((~Bge)& Bgi ); \ - Age = Bge ^((~Bgi)& Bgo ); \ - Agi = Bgi ^((~Bgo)& Bgu ); \ - Ago = Bgo ^((~Bgu)& Bga ); \ - Agu = Bgu ^((~Bga)& Bge ); \ - Ebe ^= De; \ - Bka = ROL64(Ebe, 1); \ - Egi ^= Di; \ - Bke = ROL64(Egi, 6); \ - Eko ^= Do; \ - Bki = ROL64(Eko, 25); \ - Emu ^= Du; \ - Bko = ROL64(Emu, 8); \ - Esa ^= Da; \ - Bku = ROL64(Esa, 18); \ - Aka = Bka ^((~Bke)& Bki ); \ - Ake = Bke ^((~Bki)& Bko ); \ - Aki = Bki ^((~Bko)& Bku ); \ - Ako = Bko ^((~Bku)& Bka ); \ - Aku = Bku ^((~Bka)& Bke ); \ - Ebu ^= Du; \ - Bma = ROL64(Ebu, 27); \ - Ega ^= Da; \ - Bme = ROL64(Ega, 36); \ - Eke ^= De; \ - Bmi = ROL64(Eke, 10); \ - Emi ^= Di; \ - Bmo = ROL64(Emi, 15); \ - Eso ^= Do; \ - Bmu = ROL64(Eso, 56); \ - Ama = Bma ^((~Bme)& Bmi ); \ - Ame = Bme ^((~Bmi)& Bmo ); \ - Ami = Bmi ^((~Bmo)& Bmu ); \ - Amo = Bmo ^((~Bmu)& Bma ); \ - Amu = Bmu ^((~Bma)& Bme ); \ - Ebi ^= Di; \ - Bsa = ROL64(Ebi, 62); \ - Ego ^= Do; \ - Bse = ROL64(Ego, 55); \ - Eku ^= Du; \ - Bsi = ROL64(Eku, 39); \ - Ema ^= Da; \ - Bso = ROL64(Ema, 41); \ - Ese ^= De; \ - Bsu = ROL64(Ese, 2); \ - Asa = Bsa ^((~Bse)& Bsi ); \ - Ase = Bse ^((~Bsi)& Bso ); \ - Asi = Bsi ^((~Bso)& Bsu ); \ - Aso = Bso ^((~Bsu)& Bsa ); \ - Asu = Bsu ^((~Bsa)& Bse ); - - #define K12_security 128 - #define K12_capacity (2 * K12_security) - #define K12_capacityInBytes (K12_capacity / 8) - #define K12_rateInBytes ((1600 - K12_capacity) / 8) - #define K12_chunkSize 8192 - #define K12_suffixLeaf 0x0B - - typedef struct - { - unsigned char 
state[200]; - unsigned char byteIOIndex; - } KangarooTwelve_F; - - void KeccakP1600_Permute_12rounds(unsigned char* state) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)state; - copyFromState(stateAsLanes) - rounds12 - copyToState(stateAsLanes) - } - - void KangarooTwelve_F_Absorb(KangarooTwelve_F* instance, unsigned char* data, unsigned long long dataByteLen) - { - unsigned long long i = 0; - while (i < dataByteLen) - { - if (!instance->byteIOIndex && dataByteLen >= i + K12_rateInBytes) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)instance->state; - copyFromState(stateAsLanes) - unsigned long long modifiedDataByteLen = dataByteLen - i; - while (modifiedDataByteLen >= K12_rateInBytes) - { - Aba ^= ((unsigned long long*)data)[0]; - Abe ^= ((unsigned long long*)data)[1]; - Abi ^= ((unsigned long long*)data)[2]; - Abo ^= ((unsigned long long*)data)[3]; - Abu ^= ((unsigned long long*)data)[4]; - Aga ^= ((unsigned long long*)data)[5]; - Age ^= ((unsigned long long*)data)[6]; - Agi ^= ((unsigned long long*)data)[7]; - Ago ^= ((unsigned long long*)data)[8]; - Agu ^= ((unsigned long long*)data)[9]; - Aka ^= ((unsigned long long*)data)[10]; - Ake ^= ((unsigned long long*)data)[11]; - Aki ^= ((unsigned long long*)data)[12]; - Ako ^= ((unsigned long long*)data)[13]; - Aku ^= ((unsigned long long*)data)[14]; - Ama ^= ((unsigned long long*)data)[15]; - Ame ^= ((unsigned long long*)data)[16]; - Ami ^= ((unsigned long long*)data)[17]; - Amo ^= ((unsigned long long*)data)[18]; - Amu ^= ((unsigned long long*)data)[19]; - Asa ^= ((unsigned long long*)data)[20]; - rounds12 - data += K12_rateInBytes; - modifiedDataByteLen -= K12_rateInBytes; - } - copyToState(stateAsLanes) - i = dataByteLen - modifiedDataByteLen; - } - else - { - unsigned char partialBlock; - if ((dataByteLen - i) + instance->byteIOIndex > K12_rateInBytes) - { - partialBlock = K12_rateInBytes - instance->byteIOIndex; - } - else - { - partialBlock = 
(unsigned char)(dataByteLen - i); - } - i += partialBlock; - - if (!instance->byteIOIndex) - { - unsigned int j = 0; - for (; (j + 8) <= (unsigned int)(partialBlock >> 3); j += 8) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - ((unsigned long long*)instance->state)[j + 4] ^= ((unsigned long long*)data)[j + 4]; - ((unsigned long long*)instance->state)[j + 5] ^= ((unsigned long long*)data)[j + 5]; - ((unsigned long long*)instance->state)[j + 6] ^= ((unsigned long long*)data)[j + 6]; - ((unsigned long long*)instance->state)[j + 7] ^= ((unsigned long long*)data)[j + 7]; - } - for (; (j + 4) <= (unsigned int)(partialBlock >> 3); j += 4) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - } - for (; (j + 2) <= (unsigned int)(partialBlock >> 3); j += 2) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - } - if (j < (unsigned int)(partialBlock >> 3)) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - } - if (partialBlock & 7) - { - unsigned long long lane = 0; - CopyMemory(&lane, data + (partialBlock & 0xFFFFFFF8), partialBlock & 7); - ((unsigned long long*)instance->state)[partialBlock >> 3] ^= lane; - } - } - else - { - unsigned int _sizeLeft = partialBlock; - unsigned int _lanePosition = 
instance->byteIOIndex >> 3; - unsigned int _offsetInLane = instance->byteIOIndex & 7; - const unsigned char* _curData = data; - while (_sizeLeft > 0) - { - unsigned int _bytesInLane = 8 - _offsetInLane; - if (_bytesInLane > _sizeLeft) - { - _bytesInLane = _sizeLeft; - } - if (_bytesInLane) - { - unsigned long long lane = 0; - CopyMemory(&lane, (void*)_curData, _bytesInLane); - ((unsigned long long*)instance->state)[_lanePosition] ^= (lane << (_offsetInLane << 3)); - } - _sizeLeft -= _bytesInLane; - _lanePosition++; - _offsetInLane = 0; - _curData += _bytesInLane; - } - } - data += partialBlock; - instance->byteIOIndex += partialBlock; - if (instance->byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - } - } - } +/* void KangarooTwelve(unsigned char* input, unsigned int inputByteLen, unsigned char* output, unsigned int outputByteLen) { KangarooTwelve_F queueNode; @@ -712,972 +228,27 @@ extern "C" { KeccakP1600_Permute_12rounds(finalNode.state); CopyMemory(output, finalNode.state, outputByteLen); } - - int KangarooTwelveCryptoHashFunction(unsigned char* input, unsigned int inputByteLen, unsigned char* output) +*/ + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. 
+ */ + //int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + extern int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + + + int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output) { - KangarooTwelve(input, inputByteLen, output, 64); + KangarooTwelve(input, inputByteLen, output, 64, NULL, 0); return 0; } - - void KangarooTwelve64To32(unsigned char* input, unsigned char* output) - { - #if AVX512 - __m512i Baeiou = _mm512_maskz_loadu_epi64(0x1F, input); - __m512i Gaeiou = _mm512_set_epi64(0, 0, 0, 0, 0x0700, ((unsigned long long*)input)[7], ((unsigned long long*)input)[6], ((unsigned long long*)input)[5]); - - __m512i b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, zero, 0x96), zero, padding, 0x96); - __m512i b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - __m512i b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(zero, b0, b1, 0x96), rhoK)); - __m512i b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(zero, b0, b1, 0x96), rhoM)); - __m512i b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(padding, b0, b1, 0x96), rhoS)); - __m512i b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst0); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - __m512i Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - __m512i Maeiou = 
_mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - __m512i Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst1); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = 
_mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst2); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = 
_mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst3); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = 
_mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst4); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = 
_mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst5); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, 
b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst6); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst7); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = 
_mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst8); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, 
Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst9); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), 
Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst10); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, 
_mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - - _mm512_mask_storeu_epi64(output, 0xF, _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(_mm512_unpacklo_epi64(_mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst11), _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2)), pi2S1, _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2)), pi2BG, _mm512_unpacklo_epi64(_mm512_ternarylogic_epi64(b2, b3, b4, 0xD2), _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2)))); - #else - unsigned long long Aba, Abe, Abi, Abo, Abu; - unsigned long long Aga, Age, Agi, Ago, Agu; - unsigned long long Aka, Ake, Aki, Ako, Aku; - unsigned long long Ama, Ame, Ami, Amo, Amu; - unsigned long long Asa, Ase, Asi, Aso, Asu; - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; - unsigned long long Bka, Bke, Bki, Bko, Bku; - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; - unsigned long long Ca, Ce, Ci, Co, Cu; - unsigned long long Da, De, Di, Do, Du; - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; - unsigned long long Ega, Ege, Egi, Ego, Egu; - unsigned long long Eka, Eke, Eki, Eko, Eku; - unsigned long long Ema, Eme, Emi, Emo, Emu; - unsigned long long Esa, Ese, Esi, Eso, Esu; - - Ca = ((unsigned long long*)input)[0] ^ ((unsigned long long*)input)[5] ^ 0x8000000000000000; - Ce = ((unsigned long long*)input)[1] ^ ((unsigned long long*)input)[6]; - Ci = ((unsigned long long*)input)[2] ^ ((unsigned long long*)input)[7]; - Co = ((unsigned long 
long*)input)[3] ^ 0x0700; - - Da = ((unsigned long long*)input)[4] ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(((unsigned long long*)input)[4], 1); - Du = Co ^ ROL64(Ca, 1); - Aba = ((unsigned long long*)input)[0] ^ Da; - Bbe = ROL64(((unsigned long long*)input)[6] ^ De, 44); - Bbi = ROL64(Di, 43); - Bbo = ROL64(Do, 21); - Bbu = ROL64(Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x000000008000808bULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(((unsigned long long*)input)[3] ^ Do, 28); - Bge = ROL64(Du, 20); - Bgi = ROL64(Da, 3); - Bgo = ROL64(De, 45); - Bgu = ROL64(Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(((unsigned long long*)input)[1] ^ De, 1); - Bke = ROL64(((unsigned long long*)input)[7] ^ Di, 6); - Bki = ROL64(Do, 25); - Bko = ROL64(Du, 8); - Bku = ROL64(Da ^ 0x8000000000000000, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(((unsigned long long*)input)[4] ^ Du, 27); - Bme = ROL64(((unsigned long long*)input)[5] ^ Da, 36); - Bmi = ROL64(De, 10); - Bmo = ROL64(Di, 15); - Bmu = ROL64(Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(((unsigned long long*)input)[2] ^ Di, 62); - Bse = ROL64(Do ^ 0x0700, 55); - Bsi = ROL64(Du, 39); - Bso = ROL64(Da, 41); - Bsu = ROL64(De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, 
Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x800000000000008bULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age 
^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008089ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co 
= Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008003ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu 
^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008002ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 
1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000000080ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - 
Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x000000000000800aULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ 
De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x800000008000000aULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ 
Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000080008081ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ 
__andn_u64(Bbe, Bbi) ^ 0x8000000000008080ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Bba = Aba ^ Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = 
ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Eba = Bba ^ __andn_u64(Bbe, Bbi) ^ 0x0000000080000001ULL; - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Eki = Bki ^ __andn_u64(Bko, Bku); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Bga ^ Bka ^ Bma ^ Bsa ^ __andn_u64(Bge, Bgi) ^ __andn_u64(Bke, Bki) ^ __andn_u64(Bme, Bmi) ^ __andn_u64(Bse, Bsi); - Ce = Bbe ^ Ege ^ Bke ^ Bme ^ Bse ^ __andn_u64(Bbi, Bbo) ^ __andn_u64(Bki, Bko) ^ __andn_u64(Bmi, Bmo) ^ __andn_u64(Bsi, Bso); - Ci = Bbi ^ Bgi ^ Eki ^ Bmi ^ Bsi ^ __andn_u64(Bbo, Bbu) ^ __andn_u64(Bgo, Bgu) ^ __andn_u64(Bmo, Bmu) ^ __andn_u64(Bso, Bsu); - Co = Bbo ^ Bgo ^ Bko ^ Emo ^ Bso ^ __andn_u64(Bbu, Bba) ^ __andn_u64(Bgu, Bga) ^ __andn_u64(Bku, Bka) ^ __andn_u64(Bsu, Bsa); - Cu = Bbu ^ Bgu ^ Bku ^ Bmu ^ Esu ^ __andn_u64(Bba, Bbe) ^ __andn_u64(Bga, Bge) ^ __andn_u64(Bka, Bke) ^ __andn_u64(Bma, Bme); - - Bba = Eba ^ Cu ^ ROL64(Ce, 1); - Bbe = ROL64(Ege ^ Ca ^ ROL64(Ci, 1), 44); - Bbi = ROL64(Eki ^ Ce ^ ROL64(Co, 1), 43); - Bbo = ROL64(Emo ^ Ci ^ ROL64(Cu, 1), 21); - Bbu = ROL64(Esu ^ Co ^ ROL64(Ca, 1), 14); - ((unsigned long long*)output)[0] = Bba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000080008008ULL; - ((unsigned long long*)output)[1] = Bbe ^ __andn_u64(Bbi, Bbo); - ((unsigned long long*)output)[2] = Bbi ^ __andn_u64(Bbo, Bbu); - ((unsigned long long*)output)[3] = Bbo ^ __andn_u64(Bbu, Bba); - #endif - } - - - /* Qubic Specific */ typedef unsigned long long felm_t[2]; // Datatype for representing 128-bit field elements typedef felm_t 
f2elm_t[2]; // Datatype for representing quadratic extension field elements @@ -1709,7 +280,7 @@ extern "C" { } } unsigned int identityBytesChecksum; - KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3); + KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); identityBytesChecksum &= 0x3FFFF; for (int i = 0; i < 4; i++) { @@ -1721,7 +292,7 @@ extern "C" { void getPrivateKey(unsigned char* subseed, unsigned char* privateKey) { - KangarooTwelve(subseed, 32, privateKey, 32); + KangarooTwelve(subseed, 32, privateKey, 32, NULL, 0); } @@ -1754,7 +325,7 @@ extern "C" { } } unsigned int identityBytesChecksum; - KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3); + KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); identityBytesChecksum &= 0x3FFFF; for (int i = 0; i < 4; i++) { @@ -1780,7 +351,7 @@ extern "C" { } seedBytes[i] = seed[i] - 'a'; } - KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32); + KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32, NULL, 0); return true; } diff --git a/identity/build.rs b/identity/build.rs index 11cfb38..3f29496 100644 --- a/identity/build.rs +++ b/identity/build.rs @@ -13,7 +13,7 @@ fn main() { } else { cc::Build::new() .define("__LINUX__", "1") - .define("_ARM_", "1") + .define("_X86_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -32,9 +32,11 @@ fn main() { cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") - .define("_AMD64_", "1") + .define("__LINUX__", "1") + .define("_X86_", "1") .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") } } From 23d99914e5aa3476403c73375c7d14d7cc588bb7 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 18:04:36 -0500 Subject: [PATCH 2/7] Add more K12 
functions --- .../lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S | 623 ++++++++++ .../K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h | 65 + .../K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c | 227 ++++ .../K12/lib/Inplace32BI/KeccakP-1600-SnP.h | 35 + .../Inplace32BI/KeccakP-1600-inplace32BI.c | 1068 +++++++++++++++++ .../K12/lib/Optimized64/KeccakP-1600-AVX2.s | 664 ++++++++++ .../Optimized64/KeccakP-1600-AVX512-plainC.c | 241 ++++ .../K12/lib/Optimized64/KeccakP-1600-AVX512.s | 551 +++++++++ .../K12/lib/Optimized64/KeccakP-1600-SnP.h | 74 ++ .../K12/lib/Optimized64/KeccakP-1600-opt64.c | 1026 ++++++++++++++++ .../KeccakP-1600-runtimeDispatch.c | 406 +++++++ .../Optimized64/KeccakP-1600-timesN-AVX2.c | 419 +++++++ .../Optimized64/KeccakP-1600-timesN-AVX512.c | 458 +++++++ .../Optimized64/KeccakP-1600-timesN-SSSE3.c | 438 +++++++ 14 files changed, 6295 insertions(+) create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c create mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S new file 
mode 100644 index 0000000..09aa0d2 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S @@ -0,0 +1,623 @@ +# K12 based on the eXtended Keccak Code Package (XKCP) +# https://github.com/XKCP/XKCP +# +# The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. +# +# Implementation by Gilles Van Assche, hereby denoted as "the implementer". +# Core subroutine is based on one by Andy Polyakov, available +# at https://github.com/dot-asm/cryptogams. Used with permission. +# +# For more information, feedback or questions, please refer to the Keccak Team website: +# https://keccak.team/ +# +# To the extent possible under law, the implementer has waived all copyright +# and related or neighboring rights to the source code in this file. +# http://creativecommons.org/publicdomain/zero/1.0/ + +.text + +.balign 64 // strategic alignment and padding that allows to use + // address value as loop termination condition... + .quad 0,0,0,0,0,0,0,0 +.ifdef macOS +.else +.type iotas,%object +.endif +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a +iotas12: + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.ifdef macOS +.else +.size iotas,.-iotas +.endif + +.ifdef macOS +.else +.type KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,%function +.endif +KeccakP1600_ARMv8Asha3_Permute_12rounds_internal: +.balign 32 + mov x9,#12 + adr x10,iotas12 + b .Loop_ce +.balign 16 
+.Loop_ce: + ////////////////////////////////////////////////// Theta + eor3 v25.16b,v20.16b,v15.16b,v10.16b + eor3 v26.16b,v21.16b,v16.16b,v11.16b + eor3 v27.16b,v22.16b,v17.16b,v12.16b + eor3 v28.16b,v23.16b,v18.16b,v13.16b + eor3 v29.16b,v24.16b,v19.16b,v14.16b + eor3 v25.16b,v25.16b, v5.16b,v0.16b + eor3 v26.16b,v26.16b, v6.16b,v1.16b + eor3 v27.16b,v27.16b, v7.16b,v2.16b + eor3 v28.16b,v28.16b, v8.16b,v3.16b + eor3 v29.16b,v29.16b, v9.16b,v4.16b + + rax1 v30.2d,v25.2d,v27.2d // D[1] + rax1 v31.2d,v26.2d,v28.2d // D[2] + rax1 v27.2d,v27.2d,v29.2d // D[3] + rax1 v28.2d,v28.2d,v25.2d // D[4] + rax1 v29.2d,v29.2d,v26.2d // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi + xar v25.2d, v1.2d,v30.2d,#64-1 // C[0]=A[2][0] + + xar v1.2d,v6.2d,v30.2d,#64-44 + xar v6.2d,v9.2d,v28.2d,#64-20 + xar v9.2d,v22.2d,v31.2d,#64-61 + xar v22.2d,v14.2d,v28.2d,#64-39 + xar v14.2d,v20.2d,v29.2d,#64-18 + + xar v26.2d, v2.2d,v31.2d,#64-62 // C[1]=A[4][0] + + xar v2.2d,v12.2d,v31.2d,#64-43 + xar v12.2d,v13.2d,v27.2d,#64-25 + xar v13.2d,v19.2d,v28.2d,#64-8 + xar v19.2d,v23.2d,v27.2d,#64-56 + xar v23.2d,v15.2d,v29.2d,#64-41 + + xar v15.2d,v4.2d,v28.2d,#64-27 + + xar v28.2d, v24.2d,v28.2d,#64-14 // D[4]=A[0][4] + xar v24.2d,v21.2d,v30.2d,#64-2 + xar v8.2d,v8.2d,v27.2d,#64-55 // A[1][3]=A[4][1] + xar v4.2d,v16.2d,v30.2d,#64-45 // A[0][4]=A[1][3] + xar v16.2d,v5.2d,v29.2d,#64-36 + + xar v5.2d,v3.2d,v27.2d,#64-28 + + eor v0.16b,v0.16b,v29.16b + + xar v27.2d, v18.2d,v27.2d,#64-21 // D[3]=A[0][3] + xar v3.2d,v17.2d,v31.2d,#64-15 // A[0][3]=A[3][3] + xar v30.2d, v11.2d,v30.2d,#64-10 // D[1]=A[3][2] + xar v31.2d, v7.2d,v31.2d,#64-6 // D[2]=A[2][1] + xar v29.2d, v10.2d,v29.2d,#64-3 // D[0]=A[1][2] + + ////////////////////////////////////////////////// Chi+Iota + bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] + bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] + bcax v22.16b,v22.16b,v24.16b,v23.16b + bcax v23.16b,v23.16b,v26.16b, v24.16b + bcax 
v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] + + ld1r {v26.2d},[x10],#8 + + bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] + bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] + bcax v19.16b,v19.16b,v16.16b,v15.16b + bcax v15.16b,v15.16b,v30.16b, v16.16b + bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] + + bcax v10.16b,v25.16b, v12.16b,v31.16b + bcax v11.16b,v31.16b, v13.16b,v12.16b + bcax v12.16b,v12.16b,v14.16b,v13.16b + bcax v13.16b,v13.16b,v25.16b, v14.16b + bcax v14.16b,v14.16b,v31.16b, v25.16b + + bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] + bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] + bcax v9.16b,v9.16b,v6.16b,v5.16b + bcax v5.16b,v5.16b,v29.16b, v6.16b + bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] + + bcax v3.16b,v27.16b, v0.16b,v28.16b + bcax v4.16b,v28.16b, v1.16b,v0.16b + bcax v0.16b,v0.16b,v2.16b,v1.16b + bcax v1.16b,v1.16b,v27.16b, v2.16b + bcax v2.16b,v2.16b,v28.16b, v27.16b + + eor v0.16b,v0.16b,v26.16b + + subs x9,x9,#1 + bne .Loop_ce + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,.-KeccakP1600_ARMv8Asha3_Permute_12rounds_internal +.endif + +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_Permute_12rounds +_KeccakP1600_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600_ARMv8Asha3_Permute_12rounds +.type KeccakP1600_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + ldr x30,[sp,#8] + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds,.-KeccakP1600_ARMv8Asha3_Permute_12rounds +.endif + +// size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb( +// void *state(x0), +// unsigned int laneCount(x1) = 21, +// const unsigned char *data(x2), +// size_t dataByteLen(x3)) +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +_KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.type KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,%function +KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + + // Prepare the return value + mov x11, #0 + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop + +.balign 16 +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop: + subs x3, x3, #8*21 + b.cc .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end + + // Lanes 0-3 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v0.16b, v0.16b, v27.16b + eor v1.16b, v1.16b, v28.16b + eor v2.16b, v2.16b, v29.16b + eor v3.16b, v3.16b, v30.16b + + // Lanes 4-7 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v4.16b, v4.16b, v27.16b + eor v5.16b, v5.16b, v28.16b + eor v6.16b, v6.16b, v29.16b + eor v7.16b, v7.16b, v30.16b + + // Lanes 8-11 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v8.16b, v8.16b, v27.16b + eor v9.16b, v9.16b, v28.16b + eor v10.16b, v10.16b, v29.16b + eor v11.16b, v11.16b, v30.16b + + // Lanes 12-15 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + eor v15.16b, v15.16b, v30.16b + + // Lanes 16-20 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v16.16b, v16.16b, v27.16b + eor v17.16b, v17.16b, v28.16b + eor v18.16b, v18.16b, v29.16b + eor v19.16b, v19.16b, v30.16b + ld1 {v27.8b}, [x2], #8 + eor v20.16b, v20.16b, v27.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + add x11, x11, #8*21 + + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end: + + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp 
d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + mov x0, x11 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,.-KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.endif + +.ifdef macOS +.globl _KeccakP1600times2_ARMv8Asha3_Permute_12rounds +_KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.type KeccakP1600times2_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ld1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + ld1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + ld1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + ld1 {v24.2d}, [x0] + sub x0, x0, #64*6 + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + ldr x30,[sp,#8] + st1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + st1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + st1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + st1 {v24.2d}, [x0] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600times2_ARMv8Asha3_Permute_12rounds,.-KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.endif + +.ifdef macOS +.globl _KangarooTwelve_ARMv8Asha3_Process2Leaves +_KangarooTwelve_ARMv8Asha3_Process2Leaves: +.else +.globl 
KangarooTwelve_ARMv8Asha3_Process2Leaves +.type KangarooTwelve_ARMv8Asha3_Process2Leaves,%function +KangarooTwelve_ARMv8Asha3_Process2Leaves: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + movi v4.2d, #0 + movi v5.2d, #0 + movi v6.2d, #0 + movi v7.2d, #0 + movi v8.2d, #0 + movi v9.2d, #0 + movi v10.2d, #0 + movi v11.2d, #0 + movi v12.2d, #0 + movi v13.2d, #0 + movi v14.2d, #0 + movi v15.2d, #0 + movi v16.2d, #0 + movi v17.2d, #0 + movi v18.2d, #0 + movi v19.2d, #0 + movi v20.2d, #0 + movi v21.2d, #0 + movi v22.2d, #0 + movi v23.2d, #0 + movi v24.2d, #0 + + // x12 is input + chunkSize + add x12, x0, #8192 + + // Loop over the first 48 blocks + mov x11, 48 + b .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks +.KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks: + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b 
+ rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + // Lanes 16-20 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 + ld1 {v29.d}[0], [x0], #8 + ld1 {v29.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b + rev64 v29.16b, v29.16b +#endif + eor v16.16b, v16.16b, v25.16b + eor v17.16b, v17.16b, v26.16b + eor v18.16b, v18.16b, v27.16b + eor v19.16b, v19.16b, v28.16b + eor v20.16b, v20.16b, v29.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + subs x11, x11, #1 + bne .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif 
+ eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + mov x13, #0x0B + dup v25.2d, x13 + mov x13, #0x8000000000000000 + dup v26.2d, x13 + eor v16.16b, v16.16b, v25.16b + eor v20.16b, v20.16b, v26.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + st1 {v0.1d-v3.1d}, [x1], #32 + st1 {v0.d}[1], [x1], #8 + st1 {v1.d}[1], [x1], #8 + st1 {v2.d}[1], [x1], #8 + st1 {v3.d}[1], [x1], #8 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KangarooTwelve_ARMv8Asha3_Process2Leaves,.-KangarooTwelve_ARMv8Asha3_Process2Leaves +.endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h new file mode 100644 index 0000000..512eca3 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h @@ -0,0 +1,65 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 
+ +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600_Initialize KeccakP1600_opt64_Initialize +#define KeccakP1600_AddByte KeccakP1600_opt64_AddByte +#define KeccakP1600_AddBytes KeccakP1600_opt64_AddBytes +#define KeccakP1600_Permute_12rounds KeccakP1600_ARMv8Asha3_Permute_12rounds +#define KeccakP1600_ExtractBytes KeccakP1600_opt64_ExtractBytes +#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); +void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state); +void KangarooTwelve_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output); + 
+#define KeccakP1600times2_Permute_12rounds KeccakP1600times2_ARMv8Asha3_Permute_12rounds +#define KangarooTwelve_Process2Leaves KangarooTwelve_ARMv8Asha3_Process2Leaves + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c new file mode 100644 index 0000000..7228d7a --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c @@ -0,0 +1,227 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include +#include +#include +#include + +const char * KeccakP1600_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane; + + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; + ((uint64_t*)state)[lanePosition] ^= lane; +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ + unsigned int i = 0; + + for( ; (i+8)<=laneCount; i+=8) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4]; + ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5]; + ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6]; + ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7]; + } + for( ; (i+4)<=laneCount; i+=4) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + } + for( ; (i+2)<=laneCount; i+=2) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + } + if (i 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, 
_curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ + memcpy(data, state, laneCount*8); +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; 
\ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable() +{ + return 1; +} + +const char * KeccakP1600times2_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times4_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ +} + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times8_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ +} diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h new file mode 100644 index 0000000..ac76272 --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h @@ -0,0 +1,35 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); + +#endif diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c new file mode 100644 index 0000000..a72dc7c --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c @@ -0,0 +1,1068 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +const char * KeccakP1600_GetImplementation() +{ + return "in-place 32-bit implementation"; +} + + +#define ROL32(a, offset) ((((uint32_t)a) << (offset)) ^ (((uint32_t)a) >> (32-(offset)))) + +/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + temp0 = (low); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp1 = (high); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); + +#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd = (temp0 >> 16) | (temp1 & 0xFFFF0000); + +/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + temp0 = (even); \ + temp1 = (odd); \ + temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \ + temp0 = temp; \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); + +#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + low = temp0; \ + high = temp1; + +#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + lowOut = lowIn ^ temp0; \ + highOut = highIn ^ temp1; + +void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0xFF, offset); + memset(laneAsBytes+offset, 0x00, length); + memset(laneAsBytes+offset+length, 0xFF, 8-offset-length); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | 
((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) +{ + unsigned int lanePosition = offset/8; + unsigned int offsetInLane = offset%8; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + if (offsetInLane < 4) { + low = (uint32_t)byte << (offsetInLane*8); + high = 0; + } + else { + low = 0; + high = (uint32_t)byte << ((offsetInLane-4)*8); + } + toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0, 8); + memcpy(laneAsBytes+offset, data, length); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | ((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndXOR(low, high, 
stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + const uint32_t * pI = (const uint32_t *)data; + uint32_t * pS = (uint32_t*)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + memcpy(&low, pI++, 4); + memcpy(&high, pI++, 4); + toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1); +#else + toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint32_t *stateAsHalfLanes = (uint32_t*)state; + uint32_t low, high, temp, temp0, temp1; + uint8_t laneAsBytes[8]; + + fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + *((uint32_t*)(laneAsBytes+0)) = low; + *((uint32_t*)(laneAsBytes+4)) = high; +#else + laneAsBytes[0] 
= low & 0xFF; + laneAsBytes[1] = (low >> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low >> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; +#endif + memcpy(data, laneAsBytes+offset, length); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint32_t * pI = (uint32_t *)data; + const uint32_t * pS = ( const uint32_t *)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1); + memcpy(pI++, &low, 4); + memcpy(pI++, &high, 4); +#else + fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low >> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; + memcpy(data+lanePosition*8, laneAsBytes, 8); + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = 
(offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +static const uint32_t KeccakF1600RoundConstants_int2[2*24+1] = +{ + 0x00000001UL, 0x00000000UL, + 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, + 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, + 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, + 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, + 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, + 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, + 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, + 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, + 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, + 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, + 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, + 0x00000000UL, 0x80008082UL, + 0x000000FFUL +}; + +#define KeccakRound0() \ + Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \ + Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ + Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ + De1 = Cz^Cw; 
\ + Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Age0^De0), 22); \ + Bi = ROL32((Aki1^Di1), 22); \ + Bo = ROL32((Amo1^Do1), 11); \ + Bu = ROL32((Asu0^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Age0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Age1^De1), 22); \ + Bi = ROL32((Aki0^Di0), 21); \ + Bo = ROL32((Amo0^Do0), 10); \ + Bu = ROL32((Asu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Age1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka1^Da1), 2); \ + Bo = ROL32((Ame1^De1), 23); \ + Bu = ROL32((Asi1^Di1), 31); \ + Ba = ROL32((Abo0^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka0^Da0), 1); \ + Bo = ROL32((Ame0^De0), 22); \ + Bu = ROL32((Asi0^Di0), 30); \ + Ba = ROL32((Abo1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa0^Da0), 9); \ + Ba = ROL32((Abe1^De1), 1); \ + Be = ROL32((Agi0^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Amu0^Du0), 4); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa1^Da1), 9); \ + Ba = (Abe0^De0); \ + Be = ROL32((Agi1^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Amu1^Du1), 4); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Amu1 = 
Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga0^Da0), 18); \ + Bi = ROL32((Ake0^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Aso0^Do0), 28); \ + Ba = ROL32((Abu1^Du1), 14); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga1^Da1), 18); \ + Bi = ROL32((Ake1^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Aso1^Do1), 28); \ + Ba = ROL32((Abu0^Du0), 13); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Abi0^Di0), 31); \ + Be = ROL32((Ago1^Do1), 28); \ + Bi = ROL32((Aku1^Du1), 20); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Abi1^Di1), 31); \ + Be = ROL32((Ago0^Do0), 27); \ + Bi = ROL32((Aku0^Du0), 19); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ) + +#define KeccakRound1() \ + Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \ + Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \ + Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \ + De1 = Cz^Cw; \ + Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ame1^De0), 22); \ + Bi = ROL32((Agi1^Di1), 22); \ + Bo = ROL32((Aso1^Do1), 11); \ + Bu = ROL32((Aku1^Du0), 7); \ + Aba0 = Ba 
^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ame0^De1), 22); \ + Bi = ROL32((Agi0^Di0), 21); \ + Bo = ROL32((Aso0^Do0), 10); \ + Bu = ROL32((Aku0^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa1^Da1), 2); \ + Bo = ROL32((Ake1^De1), 23); \ + Bu = ROL32((Abi1^Di1), 31); \ + Ba = ROL32((Amo1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa0^Da0), 1); \ + Bo = ROL32((Ake0^De0), 22); \ + Bu = ROL32((Abi0^Di0), 30); \ + Ba = ROL32((Amo0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama1^Da0), 9); \ + Ba = ROL32((Age1^De1), 1); \ + Be = ROL32((Asi1^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Abu1^Du0), 4); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama0^Da1), 9); \ + Ba = (Age0^De0); \ + Be = ROL32((Asi0^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Abu0^Du1), 4); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aka1^Da0), 18); \ + Bi = ROL32((Abe1^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Ago1^Do0), 28); \ + Ba = ROL32((Asu1^Du1), 14); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + 
Be = ROL32((Aka0^Da1), 18); \ + Bi = ROL32((Abe0^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Ago0^Do1), 28); \ + Ba = ROL32((Asu0^Du0), 13); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Aki1^Di0), 31); \ + Be = ROL32((Abo1^Do1), 28); \ + Bi = ROL32((Amu1^Du1), 20); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Aki0^Di1), 31); \ + Be = ROL32((Abo0^Do0), 27); \ + Bi = ROL32((Amu0^Du0), 19); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); + +#define KeccakRound2() \ + Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \ + Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \ + Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \ + De1 = Cz^Cw; \ + Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ake1^De0), 22); \ + Bi = ROL32((Asi0^Di1), 22); \ + Bo = ROL32((Ago0^Do1), 11); \ + Bu = ROL32((Amu1^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ake0^De1), 22); \ + Bi = ROL32((Asi1^Di0), 21); \ + Bo = ROL32((Ago1^Do0), 10); \ + Bu = ROL32((Amu0^Du1), 7); \ + Aba1 = 
Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama0^Da1), 2); \ + Bo = ROL32((Abe0^De1), 23); \ + Bu = ROL32((Aki0^Di1), 31); \ + Ba = ROL32((Aso1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama1^Da0), 1); \ + Bo = ROL32((Abe1^De0), 22); \ + Bu = ROL32((Aki1^Di0), 30); \ + Ba = ROL32((Aso0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga1^Da0), 9); \ + Ba = ROL32((Ame0^De1), 1); \ + Be = ROL32((Abi1^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Asu1^Du0), 4); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga0^Da1), 9); \ + Ba = (Ame1^De0); \ + Be = ROL32((Abi0^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Asu0^Du1), 4); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa1^Da0), 18); \ + Bi = ROL32((Age1^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Abo1^Do0), 28); \ + Ba = ROL32((Aku0^Du1), 14); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa0^Da1), 18); \ + Bi = ROL32((Age0^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Abo0^Do1), 28); \ + Ba = ROL32((Aku1^Du0), 13); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = 
ROL32((Aka0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Agi1^Di0), 31); \ + Be = ROL32((Amo0^Do1), 28); \ + Bi = ROL32((Abu0^Du1), 20); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aka1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Agi0^Di1), 31); \ + Be = ROL32((Amo1^Do0), 27); \ + Bi = ROL32((Abu1^Du0), 19); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); + +#define KeccakRound3() \ + Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \ + Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \ + Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \ + De1 = Cz^Cw; \ + Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Abe0^De0), 22); \ + Bi = ROL32((Abi0^Di1), 22); \ + Bo = ROL32((Abo0^Do1), 11); \ + Bu = ROL32((Abu0^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Abe1^De1), 22); \ + Bi = ROL32((Abi1^Di0), 21); \ + Bo = ROL32((Abo1^Do0), 10); \ + Bu = ROL32((Abu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga0^Da1), 2); \ + Bo = ROL32((Age0^De1), 23); \ + Bu = ROL32((Agi0^Di1), 31); \ + Ba = ROL32((Ago0^Do0), 14); \ + Be = 
ROL32((Agu0^Du0), 10); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga1^Da0), 1); \ + Bo = ROL32((Age1^De0), 22); \ + Bu = ROL32((Agi1^Di0), 30); \ + Ba = ROL32((Ago1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka0^Da0), 9); \ + Ba = ROL32((Ake0^De1), 1); \ + Be = ROL32((Aki0^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Aku0^Du0), 4); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka1^Da1), 9); \ + Ba = (Ake1^De0); \ + Be = ROL32((Aki1^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Aku1^Du1), 4); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama0^Da0), 18); \ + Bi = ROL32((Ame0^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Amo0^Do0), 28); \ + Ba = ROL32((Amu0^Du1), 14); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama1^Da1), 18); \ + Bi = ROL32((Ame1^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Amo1^Do1), 28); \ + Ba = ROL32((Amu1^Du0), 13); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Asa0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Asi0^Di0), 31); \ + Be = ROL32((Aso0^Do1), 28); \ + Bi = ROL32((Asu0^Du1), 20); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = 
ROL32((Asa1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Asi1^Di1), 31); \ + Be = ROL32((Aso1^Do0), 27); \ + Bi = ROL32((Asu1^Du0), 19); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); + +void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) +{ + uint32_t Da0, De0, Di0, Do0, Du0; + uint32_t Da1, De1, Di1, Do1, Du1; + uint32_t Ba, Be, Bi, Bo, Bu; + uint32_t Cx, Cy, Cz, Cw; + const uint32_t *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + #define Aba0 stateAsHalfLanes[ 0] + #define Aba1 stateAsHalfLanes[ 1] + #define Abe0 stateAsHalfLanes[ 2] + #define Abe1 stateAsHalfLanes[ 3] + #define Abi0 stateAsHalfLanes[ 4] + #define Abi1 stateAsHalfLanes[ 5] + #define Abo0 stateAsHalfLanes[ 6] + #define Abo1 stateAsHalfLanes[ 7] + #define Abu0 stateAsHalfLanes[ 8] + #define Abu1 stateAsHalfLanes[ 9] + #define Aga0 stateAsHalfLanes[10] + #define Aga1 stateAsHalfLanes[11] + #define Age0 stateAsHalfLanes[12] + #define Age1 stateAsHalfLanes[13] + #define Agi0 stateAsHalfLanes[14] + #define Agi1 stateAsHalfLanes[15] + #define Ago0 stateAsHalfLanes[16] + #define Ago1 stateAsHalfLanes[17] + #define Agu0 stateAsHalfLanes[18] + #define Agu1 stateAsHalfLanes[19] + #define Aka0 stateAsHalfLanes[20] + #define Aka1 stateAsHalfLanes[21] + #define Ake0 stateAsHalfLanes[22] + #define Ake1 stateAsHalfLanes[23] + #define Aki0 stateAsHalfLanes[24] + #define Aki1 stateAsHalfLanes[25] + #define Ako0 stateAsHalfLanes[26] + #define Ako1 stateAsHalfLanes[27] + #define Aku0 stateAsHalfLanes[28] + #define Aku1 stateAsHalfLanes[29] + #define Ama0 stateAsHalfLanes[30] + #define Ama1 stateAsHalfLanes[31] + #define Ame0 stateAsHalfLanes[32] + #define Ame1 stateAsHalfLanes[33] + #define Ami0 stateAsHalfLanes[34] + #define Ami1 stateAsHalfLanes[35] + #define Amo0 stateAsHalfLanes[36] + #define Amo1 stateAsHalfLanes[37] 
+ #define Amu0 stateAsHalfLanes[38] + #define Amu1 stateAsHalfLanes[39] + #define Asa0 stateAsHalfLanes[40] + #define Asa1 stateAsHalfLanes[41] + #define Ase0 stateAsHalfLanes[42] + #define Ase1 stateAsHalfLanes[43] + #define Asi0 stateAsHalfLanes[44] + #define Asi1 stateAsHalfLanes[45] + #define Aso0 stateAsHalfLanes[46] + #define Aso1 stateAsHalfLanes[47] + #define Asu0 stateAsHalfLanes[48] + #define Asu1 stateAsHalfLanes[49] + + nRounds &= 3; + switch ( nRounds ) + { + #define I0 Ba + #define I1 Be + #define T0 Bi + #define T1 Bo + #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[eo0] = T0; (in0)[eo0^1] = T1; \ + T0 = (in2)[0]; T1 = (in2)[1]; \ + (in1)[eo1] = T0; (in1)[eo1^1] = T1; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[eo2] = T0; (in2)[eo2^1] = T1; \ + (in3)[eo3] = I0; (in3)[eo3^1] = I1 + #define SwapPI2( in0,in1,in2,in3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[1] = T0; (in0)[0] = T1; \ + (in1)[1] = I0; (in1)[0] = I1; \ + I0 = (in2)[0]; I1 = (in2)[1]; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[1] = T0; (in2)[0] = T1; \ + (in3)[1] = I0; (in3)[0] = I1 + #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 + + case 1: + SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 ); + SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 ); + SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 ); + SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 ); + break; + + case 2: + SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 ); + SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 ); + SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 ); + SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 ); + SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 ); + break; + + case 3: + SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 ); + SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 ); + SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 ); + 
SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 ); + break; + #undef I0 + #undef I1 + #undef T0 + #undef T1 + #undef SwapPI13 + #undef SwapPI2 + #undef SwapEO + } + + do + { + /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */ + switch ( nRounds ) + { + case 0: KeccakRound0(); /* fall through */ + case 3: KeccakRound1(); + case 2: KeccakRound2(); + case 1: KeccakRound3(); + } + nRounds = 0; + } + while ( *pRoundConstants != 0xFF ); + + #undef Aba0 + #undef Aba1 + #undef Abe0 + #undef Abe1 + #undef Abi0 + #undef Abi1 + #undef Abo0 + #undef Abo1 + #undef Abu0 + #undef Abu1 + #undef Aga0 + #undef Aga1 + #undef Age0 + #undef Age1 + #undef Agi0 + #undef Agi1 + #undef Ago0 + #undef Ago1 + #undef Agu0 + #undef Agu1 + #undef Aka0 + #undef Aka1 + #undef Ake0 + #undef Ake1 + #undef Aki0 + #undef Aki1 + #undef Ako0 + #undef Ako1 + #undef Aku0 + #undef Aku1 + #undef Ama0 + #undef Ama1 + #undef Ame0 + #undef Ame1 + #undef Ami0 + #undef Ami1 + #undef Amo0 + #undef Amo1 + #undef Amu0 + #undef Amu1 + #undef Asa0 + #undef Asa1 + #undef Ase0 + #undef Ase1 + #undef Asi0 + #undef Asi1 + #undef Aso0 + #undef Aso1 + #undef Asu0 + #undef Asu1 +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Permute_12rounds(void *state) +{ + KeccakP1600_Permute_Nrounds(state, 12); +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s new file mode 100644 index 0000000..d7ae46b --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s @@ -0,0 +1,664 @@ +# Copyright (c) 2006-2017, CRYPTOGAMS by +# Copyright (c) 2017 Ronny Van Keer +# All rights reserved. +# +# The source code in this file is licensed under the CRYPTOGAMS license. +# For further details see http://www.openssl.org/~appro/cryptogams/. 
+# +# Notes: +# The code for the permutation (__KeccakF1600) was generated with +# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project +# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). +# The rest of the code was written by Ronny Van Keer. +# Adaptations for macOS by Stéphane Léon. + +.text + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_Initialize(void *state); +# +.ifdef macOS +.globl _KeccakP1600_AVX2_Initialize +_KeccakP1600_AVX2_Initialize: +.else +.globl KeccakP1600_AVX2_Initialize +.type KeccakP1600_AVX2_Initialize,@function +KeccakP1600_AVX2_Initialize: +.endif +.balign 32 + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,0*32(%rdi) + vmovdqu %ymm0,1*32(%rdi) + vmovdqu %ymm0,2*32(%rdi) + vmovdqu %ymm0,3*32(%rdi) + vmovdqu %ymm0,4*32(%rdi) + vmovdqu %ymm0,5*32(%rdi) + movq $0,6*32(%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +# %rdi %rsi %rdx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddByte +_KeccakP1600_AVX2_AddByte: +.else +.globl KeccakP1600_AVX2_AddByte +.type KeccakP1600_AVX2_AddByte,@function +KeccakP1600_AVX2_AddByte: +.endif +.balign 32 + mov %rdx, %rax + and $7, %rax + and $0xFFFFFFF8, %edx + lea mapState(%rip), %r9 + mov (%r9, %rdx), %rdx + add %rdx, %rdi + add %rax, %rdi + xorb %sil, (%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddBytes +_KeccakP1600_AVX2_AddBytes: +.else +.globl 
KeccakP1600_AVX2_AddBytes +.type KeccakP1600_AVX2_AddBytes,@function +KeccakP1600_AVX2_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rax, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane + add (%rdx), %rax # rax = pointer to state lane + add $8, %rdx + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rax) + inc %rax + dec %r9 + jnz KeccakP1600_AVX2_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_AddBytes_LaneAlignedCheck +KeccakP1600_AVX2_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + xor %r8, (%rax) +KeccakP1600_AVX2_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_AddBytes_LaneAlignedLoop +KeccakP1600_AVX2_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov (%rdx), %rax + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rax) + inc %rax + dec %rcx + jnz KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_ExtractBytes +_KeccakP1600_AVX2_ExtractBytes: +.else +.globl KeccakP1600_AVX2_ExtractBytes +.type KeccakP1600_AVX2_ExtractBytes,@function +KeccakP1600_AVX2_ExtractBytes: +.endif +.balign 32 + push %rbx + cmp 
$0, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck + mov $8, %rbx # rbx is (max) length of incomplete lane + sub %rax, %rbx + cmp %rcx, %rbx + cmovae %rcx, %rbx + sub %rbx, %rcx # length -= length of incomplete lane + mov (%rdx), %r9 + add $8, %rdx + add %rdi, %r9 + add %rax, %r9 +KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop: + mov (%r9), %r8b + inc %r9 + mov %r8b, (%rsi) + inc %rsi + dec %rbx + jnz KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop: + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + mov (%rax), %r8 + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov (%rdx), %rax + add %rdi, %rax + mov (%rax), %r8 +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_ExtractBytes_Exit: + pop %rbx + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx2: + ######################################### Theta + vpshufd $0b01001110,%ymm2,%ymm13 + vpxor %ymm3,%ymm5,%ymm12 + vpxor %ymm6,%ymm4,%ymm9 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm9,%ymm12,%ymm12 # C[1..4] + + vpermq $0b10010011,%ymm12,%ymm11 + vpxor %ymm2,%ymm13,%ymm13 + vpermq $0b01001110,%ymm13,%ymm7 + + vpsrlq $63,%ymm12,%ymm8 + vpaddq 
%ymm12,%ymm12,%ymm9 + vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) + + vpermq $0b00111001,%ymm8,%ymm15 + vpxor %ymm11,%ymm8,%ymm14 + vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] + + vpxor %ymm0,%ymm13,%ymm13 + vpxor %ymm7,%ymm13,%ymm13 # C[0..0] + + vpsrlq $63,%ymm13,%ymm7 + vpaddq %ymm13,%ymm13,%ymm8 + vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) + + vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] + vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] + + vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 + vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 + vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] + + ######################################### Rho + Pi + pre-Chi shuffle + vpsllvq 0*32-96(%r8),%ymm2,%ymm10 + vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 + vpor %ymm10,%ymm2,%ymm2 + + vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta + vpsllvq 2*32-96(%r8),%ymm3,%ymm11 + vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 + vpor %ymm11,%ymm3,%ymm3 + + vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta + vpsllvq 3*32-96(%r8),%ymm4,%ymm12 + vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 + vpor %ymm12,%ymm4,%ymm4 + + vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta + vpsllvq 4*32-96(%r8),%ymm5,%ymm13 + vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 + vpor %ymm13,%ymm5,%ymm5 + + vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta + vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 + vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 + vpsllvq 5*32-96(%r8),%ymm6,%ymm14 + vpsrlvq 5*32-96(%r9),%ymm6,%ymm8 + vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 + + vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta + vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 + vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 + vpsllvq 1*32-96(%r8),%ymm1,%ymm15 + vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 + vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 + + ######################################### Chi + vpsrldq $8,%ymm8,%ymm14 + vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] + + vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] 
[2][0] + vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] + vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] + vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] + vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] + vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] + vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] + vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] + vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] + vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] + vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] + vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] + vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] + vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] + + vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] + vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] + vpxor %ymm10,%ymm3,%ymm3 + vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] + vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] + vpxor %ymm12,%ymm5,%ymm5 + vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] + vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] + vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] + vpxor %ymm13,%ymm6,%ymm6 + + vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] + vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] + vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] + vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] + vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] + + vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] + vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] + vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] + vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] 
[2][2] + vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] + vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] + vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] + vpxor %ymm9,%ymm2,%ymm2 + + vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] + vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle + vpermq $0b10001101,%ymm5,%ymm5 + vpermq $0b01110010,%ymm6,%ymm6 + + vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] + vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] + vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] + vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] + vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] + vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] + vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm11,%ymm4,%ymm4 + + ######################################### Iota + vpxor (%r10),%ymm0,%ymm0 + lea 32(%r10),%r10 + + dec %eax + jnz .Loop_avx2 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + + + +.ifdef macOS +.globl _KeccakP1600_AVX2_Permute_12rounds +_KeccakP1600_AVX2_Permute_12rounds: +.else +.globl KeccakP1600_AVX2_Permute_12rounds +.type KeccakP1600_AVX2_Permute_12rounds,@function +KeccakP1600_AVX2_Permute_12rounds: +.endif +.balign 32 + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + lea 96(%rdi),%rdi + vzeroupper + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 + call __KeccakF1600 + vmovq %xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu 
%ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_12rounds_FastLoop_Absorb +_KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function +KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit + vzeroupper + cmp $21, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes + sub $21, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa map2(%rip), %xmm15 + vpcmpeqd %ymm14, %ymm14, %ymm14 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_21(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_21(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_21(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa 
mask6_21(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $21*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit: + vmovq %xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu %ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + sub $17, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa mask2_17(%rip), %ymm14 + vpxor %ymm9, %ymm9, %ymm9 + vmovdqa map2(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_17(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_17(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_17(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), 
%xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa mask6_17(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $17*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes: + lea mapState(%rip), %r9 + mov %rsi, %rax +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + mov (%r9), %r10 + add $8, %r9 + add %rdi, %r10 + xor %r8, (%r10) + sub $1, %rax + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX2_Permute_12rounds +.else + call KeccakP1600_AVX2_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.endif + +.equ ALLON, 0xFFFFFFFFFFFFFFFF + +.balign 64 +rhotates_left: + .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] + .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] + .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] + .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] + .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] + .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] +rhotates_right: + .quad 64-3, 64-18, 64-36, 64-41 + .quad 64-1, 64-62, 64-28, 64-27 + .quad 64-45, 64-6, 64-56, 64-39 + .quad 64-10, 64-61, 64-55, 64-8 + .quad 64-2, 64-15, 64-25, 64-20 + .quad 64-44, 64-43, 
64-21, 64-14 +iotas: + .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 + .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 + .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a + .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 + .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 + .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 + .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 + .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a + .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b + .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b + .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 + .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 + .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a + .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 + .quad 0x0000000080000001, 0x0000000080000001, 
0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 + +mapState: + .quad 0*8, 1*8, 2*8, 3*8, 4*8 + .quad 7*8, 21*8, 10*8, 15*8, 20*8 + .quad 5*8, 13*8, 22*8, 19*8, 12*8 + .quad 8*8, 9*8, 18*8, 23*8, 16*8 + .quad 6*8, 17*8, 14*8, 11*8, 24*8 + + .balign 16 +map2: + .long 10*8, 20*8, 5*8, 15*8 +map3: + .long 16*8, 7*8, 23*8, 14*8 +map4: + .long 11*8, 22*8, 8*8, 19*8 +map5: + .long 21*8, 17*8, 13*8, 9*8 +map6: + .long 6*8, 12*8, 18*8, 24*8 + + .balign 32 +mask3_21: + .quad ALLON, ALLON, 0, ALLON +mask4_21: + .quad ALLON, 0, ALLON, ALLON +mask5_21: + .quad 0, ALLON, ALLON, ALLON +mask6_21: + .quad ALLON, ALLON, ALLON, 0 + +mask2_17: + .quad ALLON, 0, ALLON, ALLON +mask3_17: + .quad ALLON, ALLON, 0, ALLON +mask4_17: + .quad ALLON, 0, ALLON, 0 +mask5_17: + .quad 0, 0, ALLON, ALLON +mask6_17: + .quad ALLON, ALLON, 0, 0 + +.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c new file mode 100644 index 0000000..b426421 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c @@ -0,0 +1,241 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +We would like to thank Vladimir Sedach, we have used parts of his Keccak AVX-512 C++ code. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "align.h" + +typedef __m512i V512; + +#define XOR(a,b) _mm512_xor_si512(a,b) +#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define ROL(a,offset) _mm512_rol_epi64(a,offset) +#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) + +#define LOAD_Lanes(m,a) _mm512_maskz_loadu_epi64(m,a) +#define LOAD_Lane(a) LOAD_Lanes(0x01,a) +#define LOAD_Plane(a) LOAD_Lanes(0x1F,a) +#define LOAD_8Lanes(a) LOAD_Lanes(0xFF,a) +#define STORE_Lanes(a,m,v) _mm512_mask_storeu_epi64(a,m,v) +#define STORE_Lane(a,v) STORE_Lanes(a,0x01,v) +#define STORE_Plane(a,v) STORE_Lanes(a,0x1F,v) +#define STORE_8Lanes(a,v) STORE_Lanes(a,0xFF,v) + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_Initialize(void *state) +{ + memset(state, 0, 1600/8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint8_t *stateAsBytes; + uint64_t *stateAsLanes; + + for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length) + stateAsBytes[offset] ^= *(data++); + for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 8*8; stateAsLanes += 8, data += 8*8, length -= 8*8) + STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data))); + for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8) + STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data))); + for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length) + *(stateAsBytes++) ^= *(data++); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int 
length) +{ + memcpy(data, (unsigned char*)state+offset, length); +} + +/* ---------------------------------------------------------------- */ + +const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +#define KeccakP_DeclareVars \ + V512 b0, b1, b2, b3, b4; \ + V512 Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; \ + V512 moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); \ + V512 moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); \ + V512 rhoB = _mm512_setr_epi64( 0, 1, 62, 28, 27, 0, 0, 0); \ + V512 rhoG = _mm512_setr_epi64(36, 44, 6, 55, 20, 0, 0, 0); \ + V512 rhoK = _mm512_setr_epi64( 3, 10, 43, 25, 39, 0, 0, 0); \ + V512 rhoM = _mm512_setr_epi64(41, 45, 15, 21, 8, 0, 0, 0); \ + V512 rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); \ + V512 pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); \ + V512 pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \ + V512 pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \ + V512 pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \ + V512 pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \ + V512 pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); \ + V512 pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \ + V512 pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \ + V512 pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \ + V512 pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7); + +#define 
copyFromState(pState) \ + Baeiou = LOAD_Plane(pState+ 0); \ + Gaeiou = LOAD_Plane(pState+ 5); \ + Kaeiou = LOAD_Plane(pState+10); \ + Maeiou = LOAD_Plane(pState+15); \ + Saeiou = LOAD_Plane(pState+20); + +#define copyToState(pState) \ + STORE_Plane(pState+ 0, Baeiou); \ + STORE_Plane(pState+ 5, Gaeiou); \ + STORE_Plane(pState+10, Kaeiou); \ + STORE_Plane(pState+15, Maeiou); \ + STORE_Plane(pState+20, Saeiou); + +#define KeccakP_Round(i) \ + /* Theta */ \ + b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); \ + b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \ + b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \ + b0 = _mm512_rol_epi64(b0, 1); \ + Baeiou = XOR3( Baeiou, b0, b1 ); \ + Gaeiou = XOR3( Gaeiou, b0, b1 ); \ + Kaeiou = XOR3( Kaeiou, b0, b1 ); \ + Maeiou = XOR3( Maeiou, b0, b1 ); \ + Saeiou = XOR3( Saeiou, b0, b1 ); \ + /* Rho */ \ + Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \ + Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \ + Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \ + Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \ + Saeiou = _mm512_rolv_epi64(Saeiou, rhoS); \ + /* Pi 1 */ \ + b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \ + b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \ + b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \ + b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \ + b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \ + /* Chi */ \ + Baeiou = Chi(b0, b1, b2); \ + Gaeiou = Chi(b1, b2, b3); \ + Kaeiou = Chi(b2, b3, b4); \ + Maeiou = Chi(b3, b4, b0); \ + Saeiou = Chi(b4, b0, b1); \ + /* Iota */ \ + Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); \ + /* Pi 2 */ \ + b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \ + b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \ + b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \ + b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \ + b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \ + Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \ + Gaeiou = 
_mm512_permutex2var_epi64(b2, pi2BG, b3); \ + Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \ + Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \ + Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou) + +#define rounds12 \ + KeccakP_Round( 12 ); \ + KeccakP_Round( 13 ); \ + KeccakP_Round( 14 ); \ + KeccakP_Round( 15 ); \ + KeccakP_Round( 16 ); \ + KeccakP_Round( 17 ); \ + KeccakP_Round( 18 ); \ + KeccakP_Round( 19 ); \ + KeccakP_Round( 20 ); \ + KeccakP_Round( 21 ); \ + KeccakP_Round( 22 ); \ + KeccakP_Round( 23 ) + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_Permute_12rounds(void *state) +{ + KeccakP_DeclareVars + uint64_t *stateAsLanes = (uint64_t*)state; + + copyFromState(stateAsLanes); + rounds12; + copyToState(stateAsLanes); +} + +/* ---------------------------------------------------------------- */ + +#include + +size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + size_t originalDataByteLen = dataByteLen; + + assert(laneCount == 21); + + KeccakP_DeclareVars; + uint64_t *stateAsLanes = (uint64_t*)state; + uint64_t *inDataAsLanes = (uint64_t*)data; + + copyFromState(stateAsLanes); + while(dataByteLen >= 21*8) { + Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0)); + Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5)); + Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10)); + Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15)); + Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20)); + rounds12; + inDataAsLanes += 21; + dataByteLen -= 21*8; + } + copyToState(stateAsLanes); + + return originalDataByteLen - dataByteLen; +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s new file mode 100644 index 0000000..383ca43 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s @@ -0,0 +1,551 @@ +# 
Copyright (c) 2006-2017, CRYPTOGAMS by +# Copyright (c) 2018 Ronny Van Keer +# All rights reserved. +# +# The source code in this file is licensed under the CRYPTOGAMS license. +# For further details see http://www.openssl.org/~appro/cryptogams/. +# +# Notes: +# The code for the permutation (__KeccakF1600) was generated with +# Andy Polyakov's keccak1600-avx512.pl from the CRYPTOGAMS project +# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx512.pl). +# The rest of the code was written by Ronny Van Keer. +# Adaptations for macOS by Stéphane Léon. + +.text + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_Initialize(void *state); +# +.ifdef macOS +.globl _KeccakP1600_AVX512_Initialize +_KeccakP1600_AVX512_Initialize: +.else +.globl KeccakP1600_AVX512_Initialize +.type KeccakP1600_AVX512_Initialize,@function +KeccakP1600_AVX512_Initialize: +.endif +.balign 32 + vpxorq %zmm0,%zmm0,%zmm0 + vmovdqu64 %zmm0,0*64(%rdi) + vmovdqu64 %zmm0,1*64(%rdi) + vmovdqu64 %zmm0,2*64(%rdi) + movq $0,3*64(%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_Initialize,.-KeccakP1600_AVX512_Initialize +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); +# %rdi %rsi %rdx +#!! 
+#.globl KeccakP1600_AVX512_AddByte +#.type KeccakP1600_AVX512_AddByte,@function +#.balign 32 +#KeccakP1600_AVX512_AddByte: +# mov %rdx, %rax +# and $7, %rax +# and $0xFFFFFFF8, %edx +# mov mapState(%rdx), %rdx +# add %rdx, %rdi +# add %rax, %rdi +# xorb %sil, (%rdi) +# ret +#.size KeccakP1600_AVX512_AddByte,.-KeccakP1600_AVX512_AddByte + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_AddBytes +_KeccakP1600_AVX512_AddBytes: +.else +.globl KeccakP1600_AVX512_AddBytes +.type KeccakP1600_AVX512_AddBytes,@function +KeccakP1600_AVX512_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rdx, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rdi) + inc %rdi + dec %r9 + jnz KeccakP1600_AVX512_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_AddBytes_LaneAlignedCheck +KeccakP1600_AVX512_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + xor %r8, (%rdi) + add $8, %rdi +KeccakP1600_AVX512_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_AddBytes_LaneAlignedLoop +KeccakP1600_AVX512_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit +KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rdi) + inc %rdi + dec %rcx + jnz KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX512_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_AddBytes,.-KeccakP1600_AVX512_AddBytes +.endif + +# 
----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_ExtractBytes +_KeccakP1600_AVX512_ExtractBytes: +.else +.globl KeccakP1600_AVX512_ExtractBytes +.type KeccakP1600_AVX512_ExtractBytes,@function +KeccakP1600_AVX512_ExtractBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck + mov $8, %rax # rax is (max) length of incomplete lane + sub %rdx, %rax + cmp %rcx, %rax + cmovae %rcx, %rax + sub %rax, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop: + mov (%rdi), %r8b + inc %rdi + mov %r8b, (%rsi) + inc %rsi + dec %rax + jnz KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop: + mov (%rdi), %r8 + add $8, %rdi + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + mov (%rdi), %r8 +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop +KeccakP1600_AVX512_ExtractBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_ExtractBytes,.-KeccakP1600_AVX512_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.text +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx512: + ######################################### Theta, even round + vmovdqa64 
%zmm0,%zmm5 # put aside original A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm0 # and use it as "C00" + vpternlogq $0x96,%zmm4,%zmm3,%zmm0 + vprolq $1,%zmm0,%zmm6 + vpermq %zmm0,%zmm13,%zmm0 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm0,%zmm6,%zmm5 # T[0] is original A00 + vpternlogq $0x96,%zmm0,%zmm6,%zmm1 + vpternlogq $0x96,%zmm0,%zmm6,%zmm2 + vpternlogq $0x96,%zmm0,%zmm6,%zmm3 + vpternlogq $0x96,%zmm0,%zmm6,%zmm4 + ######################################### Rho + vprolvq %zmm22,%zmm5,%zmm0 # T[0] is original A00 + vprolvq %zmm23,%zmm1,%zmm1 + vprolvq %zmm24,%zmm2,%zmm2 + vprolvq %zmm25,%zmm3,%zmm3 + vprolvq %zmm26,%zmm4,%zmm4 + ######################################### Pi + vpermq %zmm0,%zmm17,%zmm0 + vpermq %zmm1,%zmm18,%zmm1 + vpermq %zmm2,%zmm19,%zmm2 + vpermq %zmm3,%zmm20,%zmm3 + vpermq %zmm4,%zmm21,%zmm4 + ######################################### Chi + vmovdqa64 %zmm0,%zmm5 + vmovdqa64 %zmm1,%zmm6 + vpternlogq $0xD2,%zmm2,%zmm1,%zmm0 + vpternlogq $0xD2,%zmm3,%zmm2,%zmm1 + vpternlogq $0xD2,%zmm4,%zmm3,%zmm2 + vpternlogq $0xD2,%zmm5,%zmm4,%zmm3 + vpternlogq $0xD2,%zmm6,%zmm5,%zmm4 + ######################################### Iota + vpxorq (%r10),%zmm0,%zmm0{%k1} + lea 16(%r10),%r10 + ######################################### Harmonize rounds + vpblendmq %zmm2,%zmm1,%zmm6{%k2} + vpblendmq %zmm3,%zmm2,%zmm7{%k2} + vpblendmq %zmm4,%zmm3,%zmm8{%k2} + vpblendmq %zmm1,%zmm0,%zmm5{%k2} + vpblendmq %zmm0,%zmm4,%zmm9{%k2} + vpblendmq %zmm3,%zmm6,%zmm6{%k3} + vpblendmq %zmm4,%zmm7,%zmm7{%k3} + vpblendmq %zmm2,%zmm5,%zmm5{%k3} + vpblendmq %zmm0,%zmm8,%zmm8{%k3} + vpblendmq %zmm1,%zmm9,%zmm9{%k3} + vpblendmq %zmm4,%zmm6,%zmm6{%k4} + vpblendmq %zmm3,%zmm5,%zmm5{%k4} + vpblendmq %zmm0,%zmm7,%zmm7{%k4} + vpblendmq %zmm1,%zmm8,%zmm8{%k4} + vpblendmq %zmm2,%zmm9,%zmm9{%k4} + vpblendmq %zmm4,%zmm5,%zmm5{%k5} + vpblendmq %zmm0,%zmm6,%zmm6{%k5} + vpblendmq %zmm1,%zmm7,%zmm7{%k5} + vpblendmq %zmm2,%zmm8,%zmm8{%k5} + vpblendmq %zmm3,%zmm9,%zmm9{%k5} + #vpermq %zmm5,%zmm33,%zmm0 # 
doesn't actually change order + vpermq %zmm6,%zmm13,%zmm1 + vpermq %zmm7,%zmm14,%zmm2 + vpermq %zmm8,%zmm15,%zmm3 + vpermq %zmm9,%zmm16,%zmm4 + ######################################### Theta, odd round + vmovdqa64 %zmm5,%zmm0 # real A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm5 # C00 is %zmm5's alias + vpternlogq $0x96,%zmm4,%zmm3,%zmm5 + vprolq $1,%zmm5,%zmm6 + vpermq %zmm5,%zmm13,%zmm5 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm5,%zmm6,%zmm0 + vpternlogq $0x96,%zmm5,%zmm6,%zmm3 + vpternlogq $0x96,%zmm5,%zmm6,%zmm1 + vpternlogq $0x96,%zmm5,%zmm6,%zmm4 + vpternlogq $0x96,%zmm5,%zmm6,%zmm2 + ######################################### Rho + vprolvq %zmm27,%zmm0,%zmm0 + vprolvq %zmm30,%zmm3,%zmm6 + vprolvq %zmm28,%zmm1,%zmm7 + vprolvq %zmm31,%zmm4,%zmm8 + vprolvq %zmm29,%zmm2,%zmm9 + vpermq %zmm0,%zmm16,%zmm10 + vpermq %zmm0,%zmm15,%zmm11 + ######################################### Iota + vpxorq -8(%r10),%zmm0,%zmm0{%k1} + ######################################### Pi + vpermq %zmm6,%zmm14,%zmm1 + vpermq %zmm7,%zmm16,%zmm2 + vpermq %zmm8,%zmm13,%zmm3 + vpermq %zmm9,%zmm15,%zmm4 + ######################################### Chi + vpternlogq $0xD2,%zmm11,%zmm10,%zmm0 + vpermq %zmm6,%zmm13,%zmm12 + #vpermq %zmm6,%zmm33,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm12,%zmm1 + vpermq %zmm7,%zmm15,%zmm5 + vpermq %zmm7,%zmm14,%zmm7 + vpternlogq $0xD2,%zmm7,%zmm5,%zmm2 + #vpermq %zmm8,%zmm33,%zmm8 + vpermq %zmm8,%zmm16,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm8,%zmm3 + vpermq %zmm9,%zmm14,%zmm5 + vpermq %zmm9,%zmm13,%zmm9 + vpternlogq $0xD2,%zmm9,%zmm5,%zmm4 + dec %eax + jnz .Loop_avx512 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_Permute_12rounds(void *state); +# %rdi +# +.ifdef macOS +.globl _KeccakP1600_AVX512_Permute_12rounds +_KeccakP1600_AVX512_Permute_12rounds: +.else +.globl KeccakP1600_AVX512_Permute_12rounds +.type 
KeccakP1600_AVX512_Permute_12rounds,@function +KeccakP1600_AVX512_Permute_12rounds: +.endif +.balign 32 + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + #vmovdqa64 64*0(%r8),%zmm33 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} +# vpxorq %zmm5,%zmm5,%zmm5 + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 %zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_Permute_12rounds,.-KeccakP1600_AVX512_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_12rounds_FastLoop_Absorb +_KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX512_12rounds_FastLoop_Absorb,@function 
+KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX512_FastLoop_Absorb_Exit + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + cmp $21, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes + sub $21, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k6}{z} + vmovdqu64 8*20(%rdx),%zmm9{%k1}{z} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + vpxorq %zmm9,%zmm4,%zmm4 + add $21*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit: + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 
%zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} +KeccakP1600_AVX512_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes + sub $17, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k1}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k2} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + add $17*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes: + lea -96(%rdi), %rdi +KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop: + mov %rsi, %rax + mov %rdi, %r10 +KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + xor %r8, (%r10) + add $8, %r10 + sub $1, %rax + jnz KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX512_Permute_12rounds +.else + call KeccakP1600_AVX512_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop + jmp KeccakP1600_AVX512_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX512_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.endif +.balign 64 +theta_perm: + .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] + .quad 4, 0, 1, 2, 3, 5, 6, 7 + .quad 3, 4, 0, 1, 2, 5, 6, 7 + .quad 2, 3, 4, 0, 1, 5, 6, 7 + .quad 1, 2, 3, 4, 0, 5, 6, 7 +rhotates1: + .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] + .quad 18, 1, 6, 25, 8, 
0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] + .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] + .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] + .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] +rhotates0: + .quad 0, 1, 62, 28, 27, 0, 0, 0 + .quad 36, 44, 6, 55, 20, 0, 0, 0 + .quad 3, 10, 43, 25, 39, 0, 0, 0 + .quad 41, 45, 15, 21, 8, 0, 0, 0 + .quad 18, 2, 61, 56, 14, 0, 0, 0 +pi0_perm: + .quad 0, 3, 1, 4, 2, 5, 6, 7 + .quad 1, 4, 2, 0, 3, 5, 6, 7 + .quad 2, 0, 3, 1, 4, 5, 6, 7 + .quad 3, 1, 4, 2, 0, 5, 6, 7 + .quad 4, 2, 0, 3, 1, 5, 6, 7 +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +iotas_end: +.asciz "Keccak-1600 for AVX-512F, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h new file mode 100644 index 0000000..709469c --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h @@ -0,0 +1,74 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX512_Initialize(void *state); +void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_AVX512_Permute_12rounds(void *state); +void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX2_Initialize(void *state); +void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void 
KeccakP1600_AVX2_Permute_12rounds(void *state); +void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_opt64_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c new file mode 100644 index 0000000..e98056d --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c @@ -0,0 +1,1026 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
 + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include "brg_endian.h" +#include /* NOTE(review): angle-bracket header name lost in transit (stripped like an HTML tag) — confirm against upstream K12 lib/Optimized64/KeccakP-1600-opt64.c */ + +#define KeccakP1600_opt64_implementation_config "all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, 6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled, using SHLD for rotations" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +#define KeccakP1600_opt64_useSHLD +*/ + +#if defined(KeccakP1600_opt64_useLaneComplementing) +#define UseBebigokimisa +#endif + +#if defined(_MSC_VER) +#define ROL64(a, offset) _rotl64(a, offset) +#elif defined(KeccakP1600_opt64_useSHLD) + #define ROL64(x,N) ({ \ + register uint64_t __out; \ + register uint64_t __in = x; \ + __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ + __out; \ + }) +#else +#define ROL64(a, offset) ((((uint64_t)a) << offset) ^ (((uint64_t)a) >> (64-offset))) +#endif + +#ifdef KeccakP1600_opt64_fullUnrolling +#define FullUnrolling +#else +#define Unrolling KeccakP1600_opt64_unrolling +#endif + +static const uint64_t KeccakF1600RoundConstants[24] = { + 
0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +#ifdef KeccakP1600_opt64_useLaneComplementing + ((uint64_t*)state)[ 1] = ~(uint64_t)0; + ((uint64_t*)state)[ 2] = ~(uint64_t)0; + ((uint64_t*)state)[ 8] = ~(uint64_t)0; + ((uint64_t*)state)[12] = ~(uint64_t)0; + ((uint64_t*)state)[17] = ~(uint64_t)0; + ((uint64_t*)state)[20] = ~(uint64_t)0; +#endif +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint64_t lane; + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; +#else + uint64_t lane = 0; + unsigned int i; + for(i=0; i 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, 
unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#define declareABCDE \ + uint64_t Aba, Abe, Abi, Abo, Abu; \ + uint64_t Aga, Age, Agi, Ago, Agu; \ + uint64_t Aka, Ake, Aki, Ako, Aku; \ + uint64_t Ama, Ame, Ami, Amo, Amu; \ + uint64_t Asa, Ase, Asi, Aso, Asu; \ + uint64_t Bba, Bbe, Bbi, Bbo, Bbu; \ + uint64_t Bga, Bge, Bgi, Bgo, Bgu; \ + uint64_t Bka, Bke, Bki, Bko, Bku; \ + uint64_t Bma, Bme, Bmi, Bmo, Bmu; \ + uint64_t Bsa, Bse, Bsi, Bso, Bsu; \ + uint64_t Ca, Ce, Ci, Co, Cu; \ + uint64_t Da, De, Di, Do, Du; \ + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; \ + uint64_t Ega, Ege, Egi, Ego, Egu; \ + uint64_t Eka, Eke, Eki, Eko, Eku; \ + uint64_t Ema, Eme, Emi, Emo, Emu; \ + uint64_t Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = Aba^Aga^Aka^Ama^Asa; \ + Ce = Abe^Age^Ake^Ame^Ase; \ + Ci = Abi^Agi^Aki^Ami^Asi; \ + Co = Abo^Ago^Ako^Amo^Aso; \ + Cu = Abu^Agu^Aku^Amu^Asu; \ + +#ifdef UseBebigokimisa +/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^( Bbo & Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^( Bbu | Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^( Bba & Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= 
Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^( Bgi & Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + Ci ^= E##gi; \ + E##go = Bgo ^( Bgu | Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^( Bga & Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^( Bki & Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = (~Bko)^( Bku | Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^( Bka & Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^( Bmi | Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + Ci ^= E##mi; \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^( Bma | Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = (~Bse)^( Bsi | Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^( Bso & Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^( Bsu | Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^( Bsa & Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 
1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + E##bi = Bbi ^( Bbo & Bbu ); \ + E##bo = Bbo ^( Bbu | Bba ); \ + E##bu = Bbu ^( Bba & Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + E##ge = Bge ^( Bgi & Bgo ); \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + E##go = Bgo ^( Bgu | Bga ); \ + E##gu = Bgu ^( Bga & Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + E##ke = Bke ^( Bki & Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = (~Bko)^( Bku | Bka ); \ + E##ku = Bku ^( Bka & Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + E##me = Bme ^( Bmi | Bmo ); \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + E##mu = Bmu ^( Bma | Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = (~Bse)^( Bsi | Bso ); \ + E##si = Bsi ^( Bso & Bsu ); \ + 
E##so = Bso ^( Bsu | Bsa ); \ + E##su = Bsu ^( Bsa & Bse ); \ +\ + +#else /* UseBebigokimisa */ +/* --- Code for round, with prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^((~Bba)& Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + Ci ^= E##gi; \ + E##go = Bgo ^((~Bgu)& Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^((~Bga)& Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^((~Bki)& Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = Bko ^((~Bku)& Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^((~Bka)& Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = 
ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^((~Bmi)& Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + Ci ^= E##mi; \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^((~Bma)& Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = Bse ^((~Bsi)& Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^((~Bso)& Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^((~Bsu)& Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^((~Bsa)& Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + E##bu = Bbu ^((~Bba)& Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + E##go = Bgo ^((~Bgu)& Bga ); \ + E##gu = Bgu ^((~Bga)& Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu 
^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + E##ke = Bke ^((~Bki)& Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = Bko ^((~Bku)& Bka ); \ + E##ku = Bku ^((~Bka)& Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + E##me = Bme ^((~Bmi)& Bmo ); \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + E##mu = Bmu ^((~Bma)& Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = Bse ^((~Bsi)& Bso ); \ + E##si = Bsi ^((~Bso)& Bsu ); \ + E##so = Bso ^((~Bsu)& Bsa ); \ + E##su = Bsu ^((~Bsa)& Bse ); \ +\ + +#endif /* UseBebigokimisa */ + +#define copyFromState(X, state) \ + X##ba = state[ 0]; \ + X##be = state[ 1]; \ + X##bi = state[ 2]; \ + X##bo = state[ 3]; \ + X##bu = state[ 4]; \ + X##ga = state[ 5]; \ + X##ge = state[ 6]; \ + X##gi = state[ 7]; \ + X##go = state[ 8]; \ + X##gu = state[ 9]; \ + X##ka = state[10]; \ + X##ke = state[11]; \ + X##ki = state[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba; \ + state[ 1] = X##be; \ + state[ 2] = X##bi; \ + state[ 3] = X##bo; \ + state[ 4] = X##bu; \ + state[ 5] = X##ga; \ + state[ 6] = X##ge; \ + state[ 7] = X##gi; \ + state[ 8] = X##go; \ + state[ 9] = X##gu; \ + state[10] = X##ka; \ + state[11] = X##ke; \ + 
state[12] = X##ki; \ + state[13] = X##ko; \ + state[14] = X##ku; \ + state[15] = X##ma; \ + state[16] = X##me; \ + state[17] = X##mi; \ + state[18] = X##mo; \ + state[19] = X##mu; \ + state[20] = X##sa; \ + state[21] = X##se; \ + state[22] = X##si; \ + state[23] = X##so; \ + state[24] = X##su; \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + +#if ((defined(FullUnrolling)) || (Unrolling == 12)) +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (Unrolling == 4) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + 
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (Unrolling == 3) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#elif (Unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#elif (Unrolling == 1) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#else +#error "Unrolling is not correctly specified!" +#endif + +void KeccakP1600_opt64_Permute_12rounds(void *state) +{ + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + + copyFromState(A, stateAsLanes) + rounds12 + copyToState(stateAsLanes, A) +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; +#ifdef KeccakP1600_opt64_useLaneComplementing + if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) + lane = ~lane; +#endif +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +#else + unsigned int i; + lane >>= offset*8; + for(i=0; i>= 8; + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +static void fromWordToBytes(uint8_t *bytes, const uint64_t word) +{ + unsigned int i; + + for(i=0; i<(64/8); 
i++) + bytes[i] = (word >> (8*i)) & 0xFF; +} +#endif + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + memcpy(data, state, laneCount*8); +#else + unsigned int i; + + for(i=0; i 1) { + ((uint64_t*)data)[ 1] = ~((uint64_t*)data)[ 1]; + if (laneCount > 2) { + ((uint64_t*)data)[ 2] = ~((uint64_t*)data)[ 2]; + if (laneCount > 8) { + ((uint64_t*)data)[ 8] = ~((uint64_t*)data)[ 8]; + if (laneCount > 12) { + ((uint64_t*)data)[12] = ~((uint64_t*)data)[12]; + if (laneCount > 17) { + ((uint64_t*)data)[17] = ~((uint64_t*)data)[17]; + if (laneCount > 20) { + ((uint64_t*)data)[20] = ~((uint64_t*)data)[20]; + } + } + } + } + } + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, 
KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define HTOLE64(x) (x) +#else +#define HTOLE64(x) (\ + ((x & 0xff00000000000000ull) >> 56) | \ + ((x & 0x00ff000000000000ull) >> 40) | \ + ((x & 0x0000ff0000000000ull) >> 24) | \ + ((x & 0x000000ff00000000ull) >> 8) | \ + ((x & 0x00000000ff000000ull) << 8) | \ + ((x & 0x0000000000ff0000ull) << 24) | \ + ((x & 0x000000000000ff00ull) << 40) | \ + ((x & 0x00000000000000ffull) << 56)) +#endif + +#define addInput(X, input, laneCount) \ + if (laneCount == 21) { \ + X##ba ^= HTOLE64(input[ 0]); \ + X##be ^= HTOLE64(input[ 1]); \ + X##bi ^= HTOLE64(input[ 2]); \ + X##bo ^= HTOLE64(input[ 3]); \ + X##bu ^= HTOLE64(input[ 4]); \ + X##ga ^= HTOLE64(input[ 5]); \ + X##ge ^= HTOLE64(input[ 6]); \ + X##gi ^= HTOLE64(input[ 7]); \ + X##go ^= HTOLE64(input[ 8]); \ + X##gu ^= HTOLE64(input[ 9]); \ + X##ka ^= HTOLE64(input[10]); \ + X##ke ^= HTOLE64(input[11]); \ + X##ki ^= HTOLE64(input[12]); \ + X##ko ^= HTOLE64(input[13]); \ + X##ku ^= HTOLE64(input[14]); \ + X##ma ^= HTOLE64(input[15]); \ + X##me ^= HTOLE64(input[16]); \ + X##mi ^= HTOLE64(input[17]); \ + X##mo ^= HTOLE64(input[18]); \ + X##mu ^= HTOLE64(input[19]); \ + X##sa ^= HTOLE64(input[20]); \ + } \ + +#include <assert.h> + +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + size_t originalDataByteLen = dataByteLen; + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + uint64_t *inDataAsLanes = (uint64_t*)data; + + assert(laneCount == 21); + + #define laneCount 21 + copyFromState(A, stateAsLanes) + while(dataByteLen >= laneCount*8) { + addInput(A, inDataAsLanes, laneCount) + rounds12 + inDataAsLanes += laneCount; + dataByteLen -= laneCount*8; + } + #undef laneCount + 
copyToState(stateAsLanes, A) + return originalDataByteLen - dataByteLen; +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c new file mode 100644 index 0000000..22a0901 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c @@ -0,0 +1,406 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include +#include +#include +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +#ifdef KeccakP1600_disableParallelism +#undef KeccakP1600_enable_simd_options +#else + +// Forward declaration +void KangarooTwelve_SetProcessorCapabilities(); +#ifdef KeccakP1600_enable_simd_options +int K12_SSSE3_requested_disabled = 0; +int K12_AVX2_requested_disabled = 0; +int K12_AVX512_requested_disabled = 0; +#endif // KeccakP1600_enable_simd_options +int K12_enableSSSE3 = 0; +int K12_enableAVX2 = 0; +int K12_enableAVX512 = 0; + +/* ---------------------------------------------------------------- */ + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times2_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableSSSE3; + return result; +} + +const char * KeccakP1600times2_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableSSSE3) { + return "SSSE3 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process2Leaves(input, output); + } else if (K12_enableSSSE3) { + KangarooTwelve_SSSE3_Process2Leaves(input, output); + } +} + + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times4_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableAVX2; + return result; +} + +const char * KeccakP1600times4_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableAVX2) { + return "AVX2 implementation"; + } else { + return ""; + } +} + +void 
KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process4Leaves(input, output); + } else if (K12_enableAVX2) { + KangarooTwelve_AVX2_Process4Leaves(input, output); + } +} + + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times8_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + return result; +} + +const char * KeccakP1600times8_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) + KangarooTwelve_AVX512_Process8Leaves(input, output); +} + +#endif // KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation() +{ + if (K12_enableAVX512) + return "AVX-512 implementation"; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return "AVX2 implementation"; + else +#endif + return "generic 64-bit implementation"; +} + +void KeccakP1600_Initialize(void *state) +{ + KangarooTwelve_SetProcessorCapabilities(); + if (K12_enableAVX512) + KeccakP1600_AVX512_Initialize(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Initialize(state); + else +#endif + KeccakP1600_opt64_Initialize(state); +} + +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) +{ + if (K12_enableAVX512) + ((unsigned char*)(state))[offset] ^= data; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_AddByte(state, data, offset); + else +#endif + KeccakP1600_opt64_AddByte(state, data, offset); +} + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_AddBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + 
KeccakP1600_AVX2_AddBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_AddBytes(state, data, offset, length); +} + +void KeccakP1600_Permute_12rounds(void *state) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_Permute_12rounds(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Permute_12rounds(state); + else +#endif + KeccakP1600_opt64_Permute_12rounds(state); +} + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_ExtractBytes(state, data, offset, length); +} + +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + if (K12_enableAVX512) + return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#endif + return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); +} + +/* ---------------------------------------------------------------- */ + +/* Processor capability detection code by Samuel Neves and Jack O'Connor, see + * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c + */ + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(IS_X86) +static uint64_t xgetbv() { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} 
+ +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +static enum cpu_feature g_cpu_features = UNDEFINED; + +static enum cpu_feature + get_cpu_features(void) { + + if (g_cpu_features != UNDEFINED) { + return g_cpu_features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + enum cpu_feature features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + g_cpu_features = features; + return features; +#else + /* How to detect NEON? 
*/ + return 0; +#endif + } +} + +void KangarooTwelve_SetProcessorCapabilities() +{ + enum cpu_feature features = get_cpu_features(); + K12_enableSSSE3 = (features & SSSE3); + K12_enableAVX2 = (features & AVX2); + K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); +#ifdef KeccakP1600_enable_simd_options + K12_enableSSSE3 = K12_enableSSSE3 && !K12_SSSE3_requested_disabled; + K12_enableAVX2 = K12_enableAVX2 && !K12_AVX2_requested_disabled; + K12_enableAVX512 = K12_enableAVX512 && !K12_AVX512_requested_disabled; +#endif // KeccakP1600_enable_simd_options +} + +#ifdef KeccakP1600_enable_simd_options +int KangarooTwelve_DisableSSSE3(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_SSSE3_requested_disabled = 1; + if (K12_enableSSSE3) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // SSSE3 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX2(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX2_requested_disabled = 1; + if (K12_enableAVX2) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX2 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX512(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX512_requested_disabled = 1; + if (K12_enableAVX512) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX512 was disabled on this call. + } else { + return 0; // Nothing changed. 
+ } +} + +void KangarooTwelve_EnableAllCpuFeatures(void) { + K12_SSSE3_requested_disabled = 0; + K12_AVX2_requested_disabled = 0; + K12_AVX512_requested_disabled = 0; + KangarooTwelve_SetProcessorCapabilities(); +} +#endif // KeccakP1600_enable_simd_options diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c new file mode 100644 index 0000000..0abab49 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c @@ -0,0 +1,419 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include <stdint.h> +#include <immintrin.h> +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX2alignment 32 + +#define ANDnu256(a, b) _mm256_andnot_si256(a, b) +#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) +#define CONST256_64(a) _mm256_set1_epi64x(a) +#define LOAD256(a) _mm256_load_si256((const __m256i *)&(a)) +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) +#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) +#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) +#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +static ALIGN(AVX2alignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; +static ALIGN(AVX2alignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; +#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define XOR256(a, b) _mm256_xor_si256(a, b) +#define XOReq256(a, b) a = _mm256_xor_si256(a, b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) +#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c)) +#define ZERO() _mm256_setzero_si256() + +static ALIGN(AVX2alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 
0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m256i Aba, Abe, Abi, Abo, Abu; \ + __m256i Aga, Age, Agi, Ago, Agu; \ + __m256i Aka, Ake, Aki, Ako, Aku; \ + __m256i Ama, Ame, Ami, Amo, Amu; \ + __m256i Asa, Ase, Asi, Aso, Asu; \ + __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m256i Bga, Bge, Bgi, Bgo, Bgu; \ + __m256i Bka, Bke, Bki, Bko, Bku; \ + __m256i Bma, Bme, Bmi, Bmo, Bmu; \ + __m256i Bsa, Bse, Bsi, Bso, Bsu; \ + __m256i Ca, Ce, Ci, Co, Cu; \ + __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ + __m256i Da, De, Di, Do, Du; \ + __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m256i Ega, Ege, Egi, Ego, Egu; \ + __m256i Eka, Eke, Eki, Eko, Eku; \ + __m256i Ema, Eme, Emi, Emo, Emu; \ + __m256i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ + Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ + Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ + Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ + Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + 
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(Ca, E##ga); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(Ce, E##ge); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + XOReq256(Ci, E##gi); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + XOReq256(Co, E##go); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ + XOReq256(Cu, E##gu); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(Ca, E##ka); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(Ce, E##ke); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + XOReq256(Ci, E##ki); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + XOReq256(Co, E##ko); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ + XOReq256(Cu, E##ku); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(Ca, E##ma); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(Ce, E##me); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + 
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + XOReq256(Ci, E##mi); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + XOReq256(Co, E##mo); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ + XOReq256(Cu, E##mu); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(Ca, E##sa); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(Ce, E##se); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + XOReq256(Ci, E##si); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + XOReq256(Co, E##so); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + XOReq256(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, 
ANDnu256(Bge, Bgi)); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ +\ + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = 
ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq256(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq256(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq256(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq256(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq256(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq256(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ + XOReq256(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq256(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq256(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq256(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq256(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq256(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq256(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq256(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq256(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq256(X##mi, LOAD4_64((data3)[17], 
(data2)[17], (data1)[17], (data0)[17])); \ + XOReq256(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq256(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq256(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq256(Ame, CONST256_64(0x0BULL)); + XOReq256(Asa, CONST256_64(0x8000000000000000ULL)); + rounds12 + + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( Aba, Abe ); + lanesH01 = UNPACKH( Aba, Abe ); + lanesL23 = UNPACKL( Abi, Abo ); + lanesH23 = UNPACKH( Abi, Abo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( 
lanesH01, lanesH23, 0x31 ) ); + } +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c new file mode 100644 index 0000000..a19fc35 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c @@ -0,0 +1,458 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX512alignment 64 + +#define LOAD4_32(a,b,c,d) _mm_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d)) +#define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h)) +#define LOAD_GATHER2_64(idx,p) _mm_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER4_64(idx,p) _mm256_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8) +#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8) + + +/* Keccak-p[1600]×2 */ + +#define XOR(a,b) _mm_xor_si128(a,b) +#define XOReq(a, b) a = _mm_xor_si128(a, b) +#define XOR3(a,b,c) _mm_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define ROL(a,offset) _mm_rol_epi64(a,offset) +#define Chi(a,b,c) _mm_ternarylogic_epi64(a,b,c,0xD2) 
+#define CONST_64(a) _mm_set1_epi64x(a) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(AVX512alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define KeccakP_DeclareVars(type) \ + type _Ba, _Be, _Bi, _Bo, _Bu; \ + type _Da, _De, _Di, _Do, _Du; \ + type _ba, _be, _bi, _bo, _bu; \ + type _ga, _ge, _gi, _go, _gu; \ + type _ka, _ke, _ki, _ko, _ku; \ + type _ma, _me, _mi, _mo, _mu; \ + type _sa, _se, _si, _so, _su + +#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \ + _Bb1 = XOR(_L1, _Da); \ + _Bb2 = XOR(_L2, _De); \ + _Bb3 = XOR(_L3, _Di); \ + _Bb4 = XOR(_L4, _Do); \ + _Bb5 = XOR(_L5, _Du); \ + if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \ + _Bb2 = ROL(_Bb2, _Rr2); \ + _Bb3 = ROL(_Bb3, _Rr3); \ + _Bb4 = ROL(_Bb4, _Rr4); \ + _Bb5 = ROL(_Bb5, _Rr5); \ + _L1 = Chi( _Ba, _Be, _Bi); \ + _L2 = Chi( _Be, _Bi, _Bo); \ + _L3 = Chi( _Bi, _Bo, _Bu); \ + _L4 = Chi( _Bo, _Bu, _Ba); \ + _L5 = Chi( _Bu, _Ba, _Be); + +#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \ + _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \ + _Be = XOR5( _be, _ge, _ke, _me, _se ); \ + _Bi = XOR5( 
_bi, _gi, _ki, _mi, _si ); \ + _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \ + _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \ + _Da = ROL( _Be, 1 ); \ + _De = ROL( _Bi, 1 ); \ + _Di = ROL( _Bo, 1 ); \ + _Do = ROL( _Bu, 1 ); \ + _Du = ROL( _Ba, 1 ); \ + _Da = XOR( _Da, _Bu ); \ + _De = XOR( _De, _Ba ); \ + _Di = XOR( _Di, _Be ); \ + _Do = XOR( _Do, _Bi ); \ + _Du = XOR( _Du, _Bo ); \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \ + _L1 = XOR(_L1, _rc) /* Iota */ + +#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 ) + +#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 ) + +#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 ) + +#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 ) + +#define KeccakP_4rounds( i ) \ + KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST_64(KeccakP1600RoundConstants[i]) ); \ + KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \ + KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \ + KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST_64(KeccakP1600RoundConstants[i+1]) ); \ + KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \ + KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \ + KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST_64(KeccakP1600RoundConstants[i+2]) ); \ + KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \ + KeccakP_ThetaRhoPiChi2( 
_ga, _me, _bi, _ko, _su ); \ + KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \ + KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST_64(KeccakP1600RoundConstants[i+3]) ); \ + KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \ + KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \ + KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su ) + +#define rounds12 \ + KeccakP_4rounds( 12 ); \ + KeccakP_4rounds( 16 ); \ + KeccakP_4rounds( 20 ) + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + 
+#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m128i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( _bi, _bo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( _bi, _bo ) ); +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef ROL +#undef Chi +#undef CONST_64 +#undef LOAD6464 +#undef STORE128u +#undef UNPACKL +#undef UNPACKH +#undef ZERO +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×4 */ + +#define XOR(a,b) _mm256_xor_si256(a,b) +#define XOReq(a,b) a = _mm256_xor_si256(a,b) +#define XOR3(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOR512(a,b) _mm512_xor_si512(a,b) +#define ROL(a,offset) _mm256_rol_epi64(a,offset) +#define Chi(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm256_set1_epi64x(a) +#define ZERO() _mm256_setzero_si256() +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) + +#define XORdata16(X, data0, data1, data2, data3) 
\ + XOReq(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m256i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); 
j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( _ba, _be ); + lanesH01 = UNPACKH( _ba, _be ); + lanesL23 = UNPACKL( _bi, _bo ); + lanesH23 = UNPACKH( _bi, _bo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); + } +/* TODO: check if something like this would be better: + index512 = LOAD8_32(3*laneOffset+1, 2*laneOffset+1, 1*laneOffset+1, 0*laneOffset+1, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset); + STORE_SCATTER8_64(dataAsLanes+0, index512, stateAsLanes512[0/2]); + STORE_SCATTER8_64(dataAsLanes+2, index512, stateAsLanes512[2/2]); +*/ +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef XOR512 +#undef ROL +#undef Chi +#undef CONST_64 +#undef ZERO +#undef LOAD4_64 +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×8 */ + +#define XOR(a,b) _mm512_xor_si512(a,b) +#define XOReq(a,b) a = _mm512_xor_si512(a,b) +#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOReq512(a, b) a = XOR(a,b) +#define ROL(a,offset) _mm512_rol_epi64(a,offset) +#define 
Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm512_set1_epi64(a) +#define ZERO() _mm512_setzero_si512() +#define LOAD(p) _mm512_loadu_si512(p) + +#define LoadAndTranspose8(dataAsLanes, offset) \ + t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \ + t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \ + t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \ + t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \ + t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \ + t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \ + t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \ + t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \ + r0 = _mm512_unpacklo_epi64(t0, t1); \ + r1 = _mm512_unpackhi_epi64(t0, t1); \ + r2 = _mm512_unpacklo_epi64(t2, t3); \ + r3 = _mm512_unpackhi_epi64(t2, t3); \ + r4 = _mm512_unpacklo_epi64(t4, t5); \ + r5 = _mm512_unpackhi_epi64(t4, t5); \ + r6 = _mm512_unpacklo_epi64(t6, t7); \ + r7 = _mm512_unpackhi_epi64(t6, t7); \ + t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \ + t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \ + t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \ + t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \ + t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \ + t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \ + t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \ + t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \ + r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \ + r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \ + r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \ + r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \ + r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \ + r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \ + r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \ + r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd); \ + +#define XORdata16(X, index, dataAsLanes) \ + LoadAndTranspose8(dataAsLanes, 0) \ + XOReq(X##ba, r0); \ + XOReq(X##be, r1); \ + XOReq(X##bi, r2); \ + XOReq(X##bo, r3); \ + XOReq(X##bu, r4); \ + XOReq(X##ga, r5); \ + XOReq(X##ge, r6); \ + XOReq(X##gi, r7); \ + 
LoadAndTranspose8(dataAsLanes, 8) \ + XOReq(X##go, r0); \ + XOReq(X##gu, r1); \ + XOReq(X##ka, r2); \ + XOReq(X##ke, r3); \ + XOReq(X##ki, r4); \ + XOReq(X##ko, r5); \ + XOReq(X##ku, r6); \ + XOReq(X##ma, r7); \ + +#define XORdata21(X, index, dataAsLanes) \ + XORdata16(X, index, dataAsLanes) \ + XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \ + XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \ + XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \ + XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \ + XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \ + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m512i); + unsigned int j; + const uint64_t *outputAsLanes = (const uint64_t *)output; + __m256i index; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + + initializeState(_); + + index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8)); + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, index, (const uint64_t *)input); + rounds12 + input += rateInBytes; + } + + XORdata16(_, index, (const uint64_t *)input); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4); + STORE_SCATTER8_64(outputAsLanes+0, index, _ba); + STORE_SCATTER8_64(outputAsLanes+1, index, _be); + STORE_SCATTER8_64(outputAsLanes+2, index, _bi); + STORE_SCATTER8_64(outputAsLanes+3, index, _bo); +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c new file mode 100644 index 0000000..036df52 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c @@ -0,0 +1,438 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) 
+https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define KeccakP1600times2_SSSE3_unrolling 2 + +#define SSSE3alignment 16 + +#define ANDnu128(a, b) _mm_andnot_si128(a, b) +#define CONST128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define CONST128_64(a) _mm_set1_epi64x(a) +#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) +#define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) +#define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) +static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; +static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; +#define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define XOR128(a, b) _mm_xor_si128(a, b) +#define XOReq128(a, b) a = _mm_xor_si128(a, b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 
0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m128i Aba, Abe, Abi, Abo, Abu; \ + __m128i Aga, Age, Agi, Ago, Agu; \ + __m128i Aka, Ake, Aki, Ako, Aku; \ + __m128i Ama, Ame, Ami, Amo, Amu; \ + __m128i Asa, Ase, Asi, Aso, Asu; \ + __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m128i Bga, Bge, Bgi, Bgo, Bgu; \ + __m128i Bka, Bke, Bki, Bko, Bku; \ + __m128i Bma, Bme, Bmi, Bmo, Bmu; \ + __m128i Bsa, Bse, Bsi, Bso, Bsu; \ + __m128i Ca, Ce, Ci, Co, Cu; \ + __m128i Da, De, Di, Do, Du; \ + __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m128i Ega, Ege, Egi, Ego, Egu; \ + __m128i Eka, Eke, Eki, Eko, Eku; \ + __m128i Ema, Eme, Emi, Emo, Emu; \ + __m128i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ + Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ + Ci = XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ + Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ + Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, 
CONST128_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(Ca, E##ga); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(Ce, E##ge); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + XOReq128(Ci, E##gi); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + XOReq128(Co, E##go); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ + XOReq128(Cu, E##gu); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(Ca, E##ka); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(Ce, E##ke); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + XOReq128(Ci, E##ki); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + XOReq128(Co, E##ko); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ + XOReq128(Cu, E##ku); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(Ca, E##ma); \ + XOReq128(A##mi, Di); \ + Bmo = 
ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(Ce, E##me); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + XOReq128(Ci, E##mi); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + XOReq128(Co, E##mo); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ + XOReq128(Cu, E##mu); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(Ca, E##sa); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(Ce, E##se); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ + XOReq128(Ci, E##si); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + XOReq128(Co, E##so); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ + XOReq128(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = 
ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(A##mi, Di); \ + Bmo = ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ +\ + +#define 
initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) +#define rounds12 \ + prepareTheta \ + 
thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 4) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#else +#error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" 
+#endif + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + #ifndef KeccakP1600times2_SSSE3_fullUnrolling + unsigned int i; + #endif + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); + XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); +} From b81d2e0c975c219e4460d762fc58081308914c1b Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 18:42:11 -0500 Subject: [PATCH 3/7] Flags for linux build and more K12 deps --- crypto/build.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index 8d1f280..e5c2a3d 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -15,7 +15,7 @@ fn main() { .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else { + } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") .define("_X86_", "1") @@ -33,15 +33,65 @@ fn main() { cc::Build::new() .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Plain64") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + 
.file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") .compile("KangarooTwelve"); cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") .define("__LINUX__", "1") .define("_X86_", "1") - //.define("_AMD64_", "1") + .compile("Chopper") + } else { + cc::Build::new() + .define("__LINUX__", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + 
.flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") .compile("Chopper") } } From 4f9d8a64563086adc5e8d889ee3e7ad624e997c5 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 19:47:29 -0500 Subject: [PATCH 4/7] Building on Mac M1 --- crypto/build.rs | 22 +++++++--------------- identity/build.rs | 28 +++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index e5c2a3d..1bdf33e 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -18,7 +18,8 @@ fn main() { } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") - .define("_X86_", "1") + //.define("_X86_", "1") + .define("_AMD64_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -52,11 +53,13 @@ fn main() { cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") .define("__LINUX__", "1") - .define("_X86_", "1") + //.define("_X86_", "1") + .define("_AMD64_", "1") .compile("Chopper") } else { cc::Build::new() .define("__LINUX__", "1") + .define("_AMD64_", "1") .define("_ARM_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") @@ -72,20 +75,9 @@ fn main() { cc::Build::new() .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + 
.include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") .compile("KangarooTwelve"); cc::Build::new() diff --git a/identity/build.rs b/identity/build.rs index 3f29496..d9f70e0 100644 --- a/identity/build.rs +++ b/identity/build.rs @@ -10,7 +10,7 @@ fn main() { .define("_AVX_", "1") .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") - } else { + } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") .define("_X86_", "1") @@ -38,6 +38,32 @@ fn main() { .include("../ffi-deps/FourQlib/FourQ_32bit") .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") + } else { + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + + println!("cargo:rustc-link-lib=libFourQ"); + println!("cargo:rustc-link-lib=dylib=libFourQ"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") } } From 98947c959b6f9f72146bf0b82d81d0e896f4ea19 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Mon, 11 Dec 2023 13:00:25 -0500 Subject: [PATCH 5/7] Cpu tweaking in build --- crypto/build.rs | 206 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 66 deletions(-) diff --git a/crypto/build.rs 
b/crypto/build.rs index 1bdf33e..f8d270e 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -9,81 +9,155 @@ fn main() { fn main() { println!("Running crypto Build Step"); - if std::env::consts::OS == "windows" { + let os = std::env::consts::OS; + let arch = std::env::consts::ARCH; + + + if os == "windows" { cc::Build::new() .file("../ffi-deps/chopper-win.cpp") .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else if std::env::consts::OS == "linux" { - cc::Build::new() - .define("__LINUX__", "1") - //.define("_X86_", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + } else if os == "linux" { + if arch == "x86_64" || arch == "x86" { + cc::Build::new() + .define("__LINUX__", "1") + .define("_X86_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - 
.file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - //.define("_X86_", "1") - .define("_AMD64_", "1") - .compile("Chopper") + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_X86_", "1") + .compile("Chopper") + } else { //ARM + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + 
.file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } } else { - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - 
.file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); + if arch == "x86_64" || arch == "x86" { //Intel Mac + cc::Build::new() + .define("__LINUX__", "1") + .define("_X86_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } else { //Mac M1 Series + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + 
.include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } } } From 0a0afec1162df2a635d284fe95aef5ce6a16f5f9 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 15 Dec 2023 15:06:38 -0500 Subject: [PATCH 6/7] Simplify FFI Build Script --- crypto/build.rs | 195 ++++++++++++++------------------------------ identity/Cargo.toml | 3 - identity/build.rs | 69 ---------------- 3 files changed, 59 insertions(+), 208 deletions(-) delete mode 100644 identity/build.rs diff --git a/crypto/build.rs b/crypto/build.rs index f8d270e..3ca1657 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -7,11 +7,17 @@ fn main() { #[cfg(feature = "encryption")] fn main() { - println!("Running crypto Build Step"); - let os = std::env::consts::OS; let arch = std::env::consts::ARCH; - + let cpu: &str = match arch { + "x86" => "_X86_", + "x86_64" => "_X86_", + _ => "_AMD64_" + }; + let extra_four_q_define: &str = match cpu { + "_X86_" => "_BOGUS_", + _ => "_ARM_" //Mac M1 need this + }; if os == "windows" { cc::Build::new() @@ -19,145 +25,62 @@ fn main() { .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else if os == "linux" { - if arch == "x86_64" || arch == "x86" { - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - 
.file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + } - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + cc::Build::new() + .define("__LINUX__", "1") + .define(cpu, "1") + .define(extra_four_q_define, "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_X86_", "1") - .compile("Chopper") - } else { //ARM - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - 
.file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + let mut binding = cc::Build::new(); + let k12 = binding + .include("../ffi-deps/K12/lib") + .file("../ffi-deps/K12/lib/KangarooTwelve.c"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + if os == "linux" { + k12 + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - 
.define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define(cpu, "1") + .compile("Chopper") } else { + k12 + .include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .compile("KangarooTwelve"); - if arch == "x86_64" || arch == "x86" { //Intel Mac - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } else { //Mac M1 Series - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - 
cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } } } diff --git a/identity/Cargo.toml b/identity/Cargo.toml index bcfab0c..11f1988 100644 --- a/identity/Cargo.toml +++ b/identity/Cargo.toml @@ -9,6 +9,3 @@ libc = "0.2.147" crypto = { path = '../crypto', features = ["encryption"] } logger = { path = '../logger' } -[build-dependencies] -bindgen = "0.69.1" -cc = "1.0.79" diff --git a/identity/build.rs b/identity/build.rs deleted file mode 100644 index d9f70e0..0000000 --- a/identity/build.rs +++ /dev/null @@ -1,69 +0,0 @@ -extern crate cc; - -fn main() { - - if std::env::consts::OS == "windows" { - cc::Build::new() - .file("../ffi-deps/chopper-win.cpp") - .define("_MSC_VER", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") - .compile("Chopper") - } else if std::env::consts::OS == "linux" { - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - - println!("cargo:rustc-link-lib=libFourQ"); - println!("cargo:rustc-link-lib=dylib=libFourQ"); - 
- cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") - .compile("Chopper") - } else { - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - - println!("cargo:rustc-link-lib=libFourQ"); - println!("cargo:rustc-link-lib=dylib=libFourQ"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } -} - From 5c38c22a230786c534aa1d2d0421e38a405f106e Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 15 Dec 2023 16:04:12 -0500 Subject: [PATCH 7/7] Explicit return in builder --- crypto/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index 3ca1657..e6dcb11 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -20,11 +20,11 @@ fn main() { }; if os == "windows" { - cc::Build::new() + return cc::Build::new() .file("../ffi-deps/chopper-win.cpp") .define("_MSC_VER", "1") .define("_AMD64_", "1") - .compile("Chopper") + .compile("Chopper"); } cc::Build::new()