From dc3f0b86fc1528ed99a3fb34e05b5c4ddde04230 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 17:44:43 -0500 Subject: [PATCH 1/7] (Not Working) Add K12 package --- crypto/build.rs | 12 +- ffi-deps/FourQlib/FourQ_32bit/schnorrq.c | 2 +- ffi-deps/K12/README.markdown | 84 + ffi-deps/K12/lib/KangarooTwelve.c | 333 ++++ ffi-deps/K12/lib/KangarooTwelve.h | 134 ++ ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h | 48 + .../K12/lib/Plain64/KeccakP-1600-plain64.c | 24 + ffi-deps/K12/lib/align.h | 34 + ffi-deps/K12/lib/brg_endian.h | 143 ++ ffi-deps/chopper-linux.cpp | 1473 +---------------- identity/build.rs | 8 +- 11 files changed, 838 insertions(+), 1457 deletions(-) create mode 100644 ffi-deps/K12/README.markdown create mode 100644 ffi-deps/K12/lib/KangarooTwelve.c create mode 100644 ffi-deps/K12/lib/KangarooTwelve.h create mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c create mode 100644 ffi-deps/K12/lib/align.h create mode 100644 ffi-deps/K12/lib/brg_endian.h diff --git a/crypto/build.rs b/crypto/build.rs index 91ddaf0..8d1f280 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -18,7 +18,7 @@ fn main() { } else { cc::Build::new() .define("__LINUX__", "1") - .define("_ARM_", "1") + .define("_X86_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -31,9 +31,17 @@ fn main() { .file("../ffi-deps/FourQlib/sha512/sha512.c") .compile("libFourQ"); + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Plain64") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") - .define("_AMD64_", "1") + .define("__LINUX__", "1") + .define("_X86_", "1") + //.define("_AMD64_", "1") .compile("Chopper") } } diff --git a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c b/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c index e1f130f..9905350 100644 --- 
a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c +++ b/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c @@ -17,7 +17,7 @@ #include #include -extern int KangarooTwelveCryptoHashFunction(unsigned char* input, unsigned int inputByteLen, unsigned char* output); +extern int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output); #define CryptoHashFunction KangarooTwelveCryptoHashFunction ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) diff --git a/ffi-deps/K12/README.markdown b/ffi-deps/K12/README.markdown new file mode 100644 index 0000000..4a85e1b --- /dev/null +++ b/ffi-deps/K12/README.markdown @@ -0,0 +1,84 @@ +# What is KangarooTwelve ? + +[**KangarooTwelve**][k12] (or **K12**) is a fast and secure extendable-output function (XOF), the generalization of hash functions to arbitrary output lengths. +Derived from Keccak, it aims at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security. + +On high-end platforms, it can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors. +On Intel's Haswell and Skylake architectures, KangarooTwelve tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.51 cycles/byte on the SkylakeX and Cascade Lake architectures. +On the latest Apple A14 and M1 processors, KangarooTwelve can take advantage of the ARMv8-A's SHA-3 dedicated instructions to deliver 0.75 cycles/byte for long messages on a single core. +On low-end platforms, as well as for short messages, it also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128. + +More details can be found in our [ACNS Paper][eprint]. + +# What can I find here? 
+ +This repository contains source code that implements the extandable output (or hash) function [**KangarooTwelve**][k12] (or **K12**). +Its purpose is to offer optimized implementations of K12 and nothing else. + +The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for K12. +It is still structured like the XKCP in two layers. The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the K12 tree hash mode. +Also, some sources have been merged to reduce the file count. + +* For the higher layer, we kept only the code needed for K12. +* For the lower layer, we removed all the functions that are not needed for K12. The lower layer therefore implements a subset of the SnP and PlSnP interfaces. + +For Keccak or Xoodoo-based functions other than K12 only, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP. + + +# Is there a tool to compute the K12 hash of a file? + +Not in this repository, but Jack O'Connor's [`kangarootwelve_xkcp.rs` repository](https://github.com/oconnor663/kangarootwelve_xkcp.rs) contains Rust bindings to this code and a `k12sum` utility. +Pre-built binaries can be found [there](https://github.com/oconnor663/kangarootwelve_xkcp.rs/releases). + + +# How can I build this K12 code? + +This repository uses the same build system as that of the XKCP. +To build, the following tools are needed: + +* *GCC* +* *GNU make* +* *xsltproc* + +The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g., + +``` +make generic64/K12Tests +``` + +to build K12Tests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. 
As another example, the static (resp. dynamic) library is built by typing `make generic64/libK12.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name. An alternate C compiler can be specified via the `CC` environment variable. + +Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g., + +``` +make generic64/K12Tests.pack +``` + +This creates a `.tar.gz` archive with all the necessary files to build the given target. + +The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters. + +## Microsoft Visual Studio support + +KangarooTwelve can be compiled with Microsoft Visual Studio (MSVC). The XKCP build system offers support for the creation of project files. To get a project file for a given target, simply append `.vcxproj` to the target name, e.g., + +``` +make generic64noAsm/K12Tests.vcxproj +``` + +The targets `generic32` and `generic64noAsm` can be used with MSVC, but not `generic64` as it contains assembly implementations in the GCC syntax, which at this point cannot be used with MSVC. +Please refer to the documention of [XKCP][xkcp] for more details on the limitations of the support of MSVC. 
+ +[k12]: https://keccak.team/kangarootwelve.html +[xkcp]: https://github.com/XKCP/XKCP +[eprint]: https://eprint.iacr.org/2016/770.pdf + + +# Acknowledgments + +We wish to thank: + +- Andy Polyakov for his expertise with the ARMv8-A+SHA3 code, and in particular for his core routine from [CRYPTOGAMS](https://github.com/dot-asm/cryptogams) +- Duc Tri Nguyen for his benchmark on the Apple M1 +- Jack O'Connor for bug fixes and more importantly for his [Rust bindings](https://github.com/oconnor663/kangarootwelve_xkcp.rs) +- Kent Ross for his contributions to this code and its quality diff --git a/ffi-deps/K12/lib/KangarooTwelve.c b/ffi-deps/K12/lib/KangarooTwelve.c new file mode 100644 index 0000000..ad184b1 --- /dev/null +++ b/ffi-deps/K12/lib/KangarooTwelve.c @@ -0,0 +1,333 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include +#include +#include "KangarooTwelve.h" +#include "KeccakP-1600-SnP.h" + +/* ---------------------------------------------------------------- */ + +#define K12_security 128 +#define K12_capacity (2*K12_security) +#define K12_capacityInBytes (K12_capacity/8) +#define K12_rate (1600-K12_capacity) +#define K12_rateInBytes (K12_rate/8) +#define K12_rateInLanes (K12_rate/64) + +static void TurboSHAKE128_Initialize(TurboSHAKE128_Instance *instance) +{ + KeccakP1600_Initialize(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 0; +} + +static void TurboSHAKE128_Absorb(TurboSHAKE128_Instance *instance, const unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + uint8_t partialBlock; + const unsigned char *curData; + const uint8_t rateInBytes = K12_rateInBytes; + + assert(instance->squeezing == 0); + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) { +#ifdef KeccakP1600_12rounds_FastLoop_supported + /* processing full blocks first */ + j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, K12_rateInLanes, curData, dataByteLen - i); + i += j; + curData += j; +#endif + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes); + KeccakP1600_Permute_12rounds(instance->state); + curData+=rateInBytes; + } + i = dataByteLen - j; + } else { + /* normal lane: using the message queue */ + if (dataByteLen - i > (size_t)rateInBytes - instance->byteIOIndex) { + partialBlock = rateInBytes-instance->byteIOIndex; + } else { + partialBlock = (uint8_t)(dataByteLen - i); + } + i += partialBlock; + + KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + curData += partialBlock; + instance->byteIOIndex += partialBlock; + if (instance->byteIOIndex == rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + 
instance->byteIOIndex = 0; + } + } + } +} + +static void TurboSHAKE128_AbsorbDomainSeparationByte(TurboSHAKE128_Instance *instance, unsigned char D) +{ + const unsigned int rateInBytes = K12_rateInBytes; + + assert(D != 0); + assert(instance->squeezing == 0); + + /* Last few bits, whose delimiter coincides with first bit of padding */ + KeccakP1600_AddByte(instance->state, D, instance->byteIOIndex); + /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ + if ((D >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) + KeccakP1600_Permute_12rounds(instance->state); + /* Second bit of padding */ + KeccakP1600_AddByte(instance->state, 0x80, rateInBytes-1); + KeccakP1600_Permute_12rounds(instance->state); + instance->byteIOIndex = 0; + instance->squeezing = 1; +} + +static void TurboSHAKE128_Squeeze(TurboSHAKE128_Instance *instance, unsigned char *data, size_t dataByteLen) +{ + size_t i, j; + unsigned int partialBlock; + const unsigned int rateInBytes = K12_rateInBytes; + unsigned char *curData; + + if (!instance->squeezing) + TurboSHAKE128_AbsorbDomainSeparationByte(instance, 0x01); + + i = 0; + curData = data; + while(i < dataByteLen) { + if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) { + for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes); + curData+=rateInBytes; + } + i = dataByteLen - j; + } else { + /* normal lane: using the message queue */ + if (instance->byteIOIndex == rateInBytes) { + KeccakP1600_Permute_12rounds(instance->state); + instance->byteIOIndex = 0; + } + if (dataByteLen-i > rateInBytes-instance->byteIOIndex) + partialBlock = rateInBytes-instance->byteIOIndex; + else + partialBlock = (unsigned int)(dataByteLen - i); + i += partialBlock; + + KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); + curData += 
partialBlock; + instance->byteIOIndex += partialBlock; + } + } +} + +/* ---------------------------------------------------------------- */ + +typedef enum { + NOT_INITIALIZED, + ABSORBING, + FINAL, + SQUEEZING +} KCP_Phases; +typedef KCP_Phases KangarooTwelve_Phases; + +#define K12_chunkSize 8192 +#define K12_suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ + +#ifndef KeccakP1600_disableParallelism + +void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output); + +#define ProcessLeaves( Parallellism ) \ + while (inputByteLen >= Parallellism * K12_chunkSize) { \ + unsigned char intermediate[Parallellism*K12_capacityInBytes]; \ + \ + KangarooTwelve_Process##Parallellism##Leaves(input, intermediate); \ + input += Parallellism * K12_chunkSize; \ + inputByteLen -= Parallellism * K12_chunkSize; \ + ktInstance->blockNumber += Parallellism; \ + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, Parallellism * K12_capacityInBytes); \ + } + +#endif // KeccakP1600_disableParallelism + +static unsigned int right_encode(unsigned char * encbuf, size_t value) +{ + unsigned int n, i; + size_t v; + + for (v = value, n = 0; v && (n < sizeof(size_t)); ++n, v >>= 8) + ; /* empty */ + for (i = 1; i <= n; ++i) { + encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); + } + encbuf[n] = (unsigned char)n; + return n + 1; +} + +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen) +{ + ktInstance->fixedOutputLength = outputByteLen; + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 0; + ktInstance->phase = ABSORBING; + TurboSHAKE128_Initialize(&ktInstance->finalNode); + return 0; +} + +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen) +{ + if 
(ktInstance->phase != ABSORBING) + return 1; + + if (ktInstance->blockNumber == 0) { + /* First block, absorb in final node */ + unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? (unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); + TurboSHAKE128_Absorb(&ktInstance->finalNode, input, len); + input += len; + inputByteLen -= len; + ktInstance->queueAbsorbedLen += len; + if ((ktInstance->queueAbsorbedLen == K12_chunkSize) && (inputByteLen != 0)) { + /* First block complete and more input data available, finalize it */ + const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ + ktInstance->queueAbsorbedLen = 0; + ktInstance->blockNumber = 1; + TurboSHAKE128_Absorb(&ktInstance->finalNode, &padding, 1); + ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ + } + } else if (ktInstance->queueAbsorbedLen != 0) { + /* There is data in the queue, absorb further in queue until block complete */ + unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? 
(unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); + TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); + input += len; + inputByteLen -= len; + ktInstance->queueAbsorbedLen += len; + if (ktInstance->queueAbsorbedLen == K12_chunkSize) { + unsigned char intermediate[K12_capacityInBytes]; + ktInstance->queueAbsorbedLen = 0; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } + } + +#ifndef KeccakP1600_disableParallelism + if (KeccakP1600times8_IsAvailable()) { + ProcessLeaves(8); + } + + if (KeccakP1600times4_IsAvailable()) { + ProcessLeaves(4); + } + + if (KeccakP1600times2_IsAvailable()) { + ProcessLeaves(2); + } +#endif + + while (inputByteLen > 0) { + unsigned int len = (inputByteLen < K12_chunkSize) ? (unsigned int)inputByteLen : K12_chunkSize; + TurboSHAKE128_Initialize(&ktInstance->queueNode); + TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); + input += len; + inputByteLen -= len; + if (len == K12_chunkSize) { + unsigned char intermediate[K12_capacityInBytes]; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } else { + ktInstance->queueAbsorbedLen = len; + } + } + + return 0; +} + +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen) +{ + unsigned char encbuf[sizeof(size_t)+1+2]; + unsigned char padding; + + if (ktInstance->phase != ABSORBING) + return 1; + + /* Absorb customization | right_encode(customByteLen) */ + if ((customByteLen != 0) && 
(KangarooTwelve_Update(ktInstance, customization, customByteLen) != 0)) + return 1; + if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customByteLen)) != 0) + return 1; + + if (ktInstance->blockNumber == 0) { + /* Non complete first block in final node, pad it */ + padding = 0x07; /* '11': message hop, final node */ + } else { + unsigned int n; + + if (ktInstance->queueAbsorbedLen != 0) { + /* There is data in the queue node */ + unsigned char intermediate[K12_capacityInBytes]; + ++ktInstance->blockNumber; + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); + TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); + TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); + } + --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ + n = right_encode(encbuf, ktInstance->blockNumber); + encbuf[n++] = 0xFF; + encbuf[n++] = 0xFF; + TurboSHAKE128_Absorb(&ktInstance->finalNode, encbuf, n); + padding = 0x06; /* '01': chaining hop, final node */ + } + TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->finalNode, padding); + if (ktInstance->fixedOutputLength != 0) { + ktInstance->phase = FINAL; + TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); + return 0; + } + ktInstance->phase = SQUEEZING; + return 0; +} + +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen) +{ + if (ktInstance->phase != SQUEEZING) + return 1; + TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, outputByteLen); + return 0; +} + +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, + unsigned char *output, size_t outputByteLen, + const unsigned char *customization, size_t customByteLen) +{ + KangarooTwelve_Instance ktInstance; + + if (outputByteLen == 0) + return 1; + KangarooTwelve_Initialize(&ktInstance, outputByteLen); + if 
(KangarooTwelve_Update(&ktInstance, input, inputByteLen) != 0) + return 1; + return KangarooTwelve_Final(&ktInstance, output, customization, customByteLen); +} diff --git a/ffi-deps/K12/lib/KangarooTwelve.h b/ffi-deps/K12/lib/KangarooTwelve.h new file mode 100644 index 0000000..f7b3e33 --- /dev/null +++ b/ffi-deps/K12/lib/KangarooTwelve.h @@ -0,0 +1,134 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _KangarooTwelve_h_ +#define _KangarooTwelve_h_ + +#include +#include +#include "align.h" +#include "KeccakP-1600-SnP.h" + +typedef struct TurboSHAKE128_InstanceStruct { + uint8_t state[KeccakP1600_stateSizeInBytes]; + uint8_t byteIOIndex; + uint8_t squeezing; +} TurboSHAKE128_Instance; + +typedef struct KangarooTwelve_InstanceStruct { + ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance queueNode; + ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance finalNode; + size_t fixedOutputLength; + size_t blockNumber; + unsigned int queueAbsorbedLen; + int phase; +} KangarooTwelve_Instance; + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). 
+ * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + +/** + * Function to initialize a KangarooTwelve instance. + * @param ktInstance Pointer to the instance to be initialized. + * @param outputByteLen The desired number of output bytes, + * or 0 for an arbitrarily-long output. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); + +/** + * Function to give input data to be absorbed. + * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). + * @param input Pointer to the input message data (M). + * @param inputByteLen The number of bytes provided in the input message data. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); + +/** + * Function to call after all the input message has been input, and to get + * output bytes if the length was specified when calling KangarooTwelve_Initialize(). + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of + * output bytes is equal to @a outputByteLen. + * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes + * must be extracted using the KangarooTwelve_Squeeze() function. + * @param output Pointer to the buffer where to store the output data. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. 
+ */ +int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); + +/** + * Function to squeeze output data. + * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). + * @param data Pointer to the buffer where to store the output data. + * @param outputByteLen The number of output bytes desired. + * @pre KangarooTwelve_Final() must have been already called. + * @return 0 if successful, 1 otherwise. + */ +int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); + +#if !defined(KeccakP1600_disableParallelism) && defined(KeccakP1600_enable_simd_options) +/** + * Functions to selectively disable the use of CPU features. Should be rarely + * needed; if you're not sure this is what you want, don't worry about it. + * + * /!\ WARNING /!\: Calling these functions REQUIRES that there are no + * KangarooTwelve instances in use. The effects are global and affect the code + * paths taken by every call, as well as the details of the represented states. + * Calling these functions in the middle of your program (as opposed to during + * setup) is PROBABLY WRONG. + * + * These functions are at present only used to increase test suite coverage, + * and demonstrate comparative performance between implementations in different + * instruction sets. To enable them, the macro KeccakP1600_enable_simd_options + * must be defined at compile time. + * + * They can potentially also be useful in an environment where it is + * detrimental to online large vector units on the CPU, since doing so can lead + * to downclocking, performance hits in other threads sharing the same CPU + * core, and short delays while the CPU's power license is increased to online + * the vector unit. 
+ * + * In the majority of situations, however, this should rarely matter and it is + * usually the case that the performance difference will be a wash or even an + * overall improvement despite the downsides. + * + * @return 1 if the feature was enabled and available and has been turned off, + * 0 if it was already disabled or unavailable. + */ +int KangarooTwelve_DisableAVX512(void); +int KangarooTwelve_DisableAVX2(void); +int KangarooTwelve_DisableSSSE3(void); + +/** + * Function to reset all CPU features to enabled-if-available. Calling this + * always has no effect if no CPU features have been explicitly disabled. + */ +void KangarooTwelve_EnableAllCpuFeatures(void); +#endif // !KeccakP1600_disableParallelism && KeccakP1600_enable_simd_options + +#endif diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h new file mode 100644 index 0000000..d9e0c6e --- /dev/null +++ b/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h @@ -0,0 +1,48 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported +#define KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +// Instead of defining proxy functions which do nothing, simply rename the +// symbols of the opt64 implementation where they are used. +#define KeccakP1600_opt64_Initialize KeccakP1600_Initialize +#define KeccakP1600_opt64_AddByte KeccakP1600_AddByte +#define KeccakP1600_opt64_AddBytes KeccakP1600_AddBytes +#define KeccakP1600_opt64_Permute_12rounds KeccakP1600_Permute_12rounds +#define KeccakP1600_opt64_ExtractBytes KeccakP1600_ExtractBytes +#define KeccakP1600_opt64_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb + +#endif diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c b/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c new file mode 100644 index 0000000..0043b4f --- /dev/null +++ b/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c @@ -0,0 +1,24 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +const char * KeccakP1600_GetImplementation() +{ + return "generic 64-bit implementation"; +} diff --git a/ffi-deps/K12/lib/align.h b/ffi-deps/K12/lib/align.h new file mode 100644 index 0000000..31586bb --- /dev/null +++ b/ffi-deps/K12/lib/align.h @@ -0,0 +1,34 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#ifndef _align_h_ +#define _align_h_ + +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif + +#endif diff --git a/ffi-deps/K12/lib/brg_endian.h b/ffi-deps/K12/lib/brg_endian.h new file mode 100644 index 0000000..7c640b9 --- /dev/null +++ b/ffi-deps/K12/lib/brg_endian.h @@ -0,0 +1,143 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. 
+ + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define 
PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( 
__OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ + defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/ffi-deps/chopper-linux.cpp b/ffi-deps/chopper-linux.cpp index 07ea48f..193b1ef 100644 --- a/ffi-deps/chopper-linux.cpp +++ b/ffi-deps/chopper-linux.cpp @@ -61,51 +61,6 @@ long long unsigned int __shiftright128( #endif -//From Qiner - - -#if AVX512 -const __m512i zero = _mm512_maskz_set1_epi64(0, 0); -const __m512i moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); -const __m512i moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); -const __m512i rhoB = _mm512_setr_epi64(0, 1, 62, 28, 27, 0, 0, 0); -const __m512i rhoG = _mm512_setr_epi64(36, 44, 6, 55, 20, 0, 0, 0); -const __m512i rhoK = _mm512_setr_epi64(3, 10, 43, 25, 39, 0, 0, 0); -const __m512i rhoM = _mm512_setr_epi64(41, 45, 15, 
21, 8, 0, 0, 0); -const __m512i rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); -const __m512i pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); -const __m512i pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); -const __m512i pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); -const __m512i pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); -const __m512i pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); -const __m512i pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 8, 10); -const __m512i pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 9, 11); -const __m512i pi2BG = _mm512_setr_epi64(0, 1, 8, 9, 6, 5, 6, 7); -const __m512i pi2KM = _mm512_setr_epi64(2, 3, 10, 11, 7, 5, 6, 7); -const __m512i pi2S3 = _mm512_setr_epi64(4, 5, 12, 13, 4, 5, 6, 7); -const __m512i padding = _mm512_maskz_set1_epi64(1, 0x8000000000000000); - -const __m512i K12RoundConst0 = _mm512_maskz_set1_epi64(1, 0x000000008000808bULL); -const __m512i K12RoundConst1 = _mm512_maskz_set1_epi64(1, 0x800000000000008bULL); -const __m512i K12RoundConst2 = _mm512_maskz_set1_epi64(1, 0x8000000000008089ULL); -const __m512i K12RoundConst3 = _mm512_maskz_set1_epi64(1, 0x8000000000008003ULL); -const __m512i K12RoundConst4 = _mm512_maskz_set1_epi64(1, 0x8000000000008002ULL); -const __m512i K12RoundConst5 = _mm512_maskz_set1_epi64(1, 0x8000000000000080ULL); -const __m512i K12RoundConst6 = _mm512_maskz_set1_epi64(1, 0x000000000000800aULL); -const __m512i K12RoundConst7 = _mm512_maskz_set1_epi64(1, 0x800000008000000aULL); -const __m512i K12RoundConst8 = _mm512_maskz_set1_epi64(1, 0x8000000080008081ULL); -const __m512i K12RoundConst9 = _mm512_maskz_set1_epi64(1, 0x8000000000008080ULL); -const __m512i K12RoundConst10 = _mm512_maskz_set1_epi64(1, 0x0000000080000001ULL); -const __m512i K12RoundConst11 = _mm512_maskz_set1_epi64(1, 0x8000000080008008ULL); - -#endif - - - - - - -//End From Qiner #include #include @@ -133,448 +88,9 @@ extern "C" { #define CopyMemory(x, y, z) memcpy(x, y, z) - #define KeccakF1600RoundConstant0 
0x000000008000808bULL - #define KeccakF1600RoundConstant1 0x800000000000008bULL - #define KeccakF1600RoundConstant2 0x8000000000008089ULL - #define KeccakF1600RoundConstant3 0x8000000000008003ULL - #define KeccakF1600RoundConstant4 0x8000000000008002ULL - #define KeccakF1600RoundConstant5 0x8000000000000080ULL - #define KeccakF1600RoundConstant6 0x000000000000800aULL - #define KeccakF1600RoundConstant7 0x800000008000000aULL - #define KeccakF1600RoundConstant8 0x8000000080008081ULL - #define KeccakF1600RoundConstant9 0x8000000000008080ULL - #define KeccakF1600RoundConstant10 0x0000000080000001ULL - - #define declareABCDE \ - unsigned long long Aba, Abe, Abi, Abo, Abu; \ - unsigned long long Aga, Age, Agi, Ago, Agu; \ - unsigned long long Aka, Ake, Aki, Ako, Aku; \ - unsigned long long Ama, Ame, Ami, Amo, Amu; \ - unsigned long long Asa, Ase, Asi, Aso, Asu; \ - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; \ - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; \ - unsigned long long Bka, Bke, Bki, Bko, Bku; \ - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; \ - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; \ - unsigned long long Ca, Ce, Ci, Co, Cu; \ - unsigned long long Da, De, Di, Do, Du; \ - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; \ - unsigned long long Ega, Ege, Egi, Ego, Egu; \ - unsigned long long Eka, Eke, Eki, Eko, Eku; \ - unsigned long long Ema, Eme, Emi, Emo, Emu; \ - unsigned long long Esa, Ese, Esi, Eso, Esu; \ - - #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstant##i; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^((~Bbo)& 
Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^((~Bba)& Bbe ); \ - Cu = E##bu; \ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - Ci ^= E##gi; \ - E##go = Bgo ^((~Bgu)& Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^((~Bga)& Bge ); \ - Cu ^= E##gu; \ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^((~Bki)& Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = Bko ^((~Bku)& Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^((~Bka)& Bke ); \ - Cu ^= E##ku; \ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^((~Bmi)& Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - Ci ^= E##mi; \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^((~Bma)& Bme ); \ - Cu ^= E##mu; \ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = Bse ^((~Bsi)& Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^((~Bso)& Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^((~Bsu)& Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^((~Bsa)& Bse ); 
\ - Cu ^= E##su; - - #define copyFromState(state) \ - Aba = state[ 0]; \ - Abe = state[ 1]; \ - Abi = state[ 2]; \ - Abo = state[ 3]; \ - Abu = state[ 4]; \ - Aga = state[ 5]; \ - Age = state[ 6]; \ - Agi = state[ 7]; \ - Ago = state[ 8]; \ - Agu = state[ 9]; \ - Aka = state[10]; \ - Ake = state[11]; \ - Aki = state[12]; \ - Ako = state[13]; \ - Aku = state[14]; \ - Ama = state[15]; \ - Ame = state[16]; \ - Ami = state[17]; \ - Amo = state[18]; \ - Amu = state[19]; \ - Asa = state[20]; \ - Ase = state[21]; \ - Asi = state[22]; \ - Aso = state[23]; \ - Asu = state[24]; - - #define copyToState(state) \ - state[ 0] = Aba; \ - state[ 1] = Abe; \ - state[ 2] = Abi; \ - state[ 3] = Abo; \ - state[ 4] = Abu; \ - state[ 5] = Aga; \ - state[ 6] = Age; \ - state[ 7] = Agi; \ - state[ 8] = Ago; \ - state[ 9] = Agu; \ - state[10] = Aka; \ - state[11] = Ake; \ - state[12] = Aki; \ - state[13] = Ako; \ - state[14] = Aku; \ - state[15] = Ama; \ - state[16] = Ame; \ - state[17] = Ami; \ - state[18] = Amo; \ - state[19] = Amu; \ - state[20] = Asa; \ - state[21] = Ase; \ - state[22] = Asi; \ - state[23] = Aso; \ - state[24] = Asu; - - #define rounds12 \ - Ca = Aba^Aga^Aka^Ama^Asa; \ - Ce = Abe^Age^Ake^Ame^Ase; \ - Ci = Abi^Agi^Aki^Ami^Asi; \ - Co = Abo^Ago^Ako^Amo^Aso; \ - Cu = Abu^Agu^Aku^Amu^Asu; \ - thetaRhoPiChiIotaPrepareTheta(0, A, E) \ - thetaRhoPiChiIotaPrepareTheta(1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - Eba ^= Da; \ - Bba = Eba; \ - Ege ^= De; \ - Bbe = ROL64(Ege, 44); \ - Eki ^= Di; \ - Bbi 
= ROL64(Eki, 43); \ - Emo ^= Do; \ - Bbo = ROL64(Emo, 21); \ - Esu ^= Du; \ - Bbu = ROL64(Esu, 14); \ - Aba = Bba ^((~Bbe)& Bbi ); \ - Aba ^= 0x8000000080008008ULL; \ - Abe = Bbe ^((~Bbi)& Bbo ); \ - Abi = Bbi ^((~Bbo)& Bbu ); \ - Abo = Bbo ^((~Bbu)& Bba ); \ - Abu = Bbu ^((~Bba)& Bbe ); \ - Ebo ^= Do; \ - Bga = ROL64(Ebo, 28); \ - Egu ^= Du; \ - Bge = ROL64(Egu, 20); \ - Eka ^= Da; \ - Bgi = ROL64(Eka, 3); \ - Eme ^= De; \ - Bgo = ROL64(Eme, 45); \ - Esi ^= Di; \ - Bgu = ROL64(Esi, 61); \ - Aga = Bga ^((~Bge)& Bgi ); \ - Age = Bge ^((~Bgi)& Bgo ); \ - Agi = Bgi ^((~Bgo)& Bgu ); \ - Ago = Bgo ^((~Bgu)& Bga ); \ - Agu = Bgu ^((~Bga)& Bge ); \ - Ebe ^= De; \ - Bka = ROL64(Ebe, 1); \ - Egi ^= Di; \ - Bke = ROL64(Egi, 6); \ - Eko ^= Do; \ - Bki = ROL64(Eko, 25); \ - Emu ^= Du; \ - Bko = ROL64(Emu, 8); \ - Esa ^= Da; \ - Bku = ROL64(Esa, 18); \ - Aka = Bka ^((~Bke)& Bki ); \ - Ake = Bke ^((~Bki)& Bko ); \ - Aki = Bki ^((~Bko)& Bku ); \ - Ako = Bko ^((~Bku)& Bka ); \ - Aku = Bku ^((~Bka)& Bke ); \ - Ebu ^= Du; \ - Bma = ROL64(Ebu, 27); \ - Ega ^= Da; \ - Bme = ROL64(Ega, 36); \ - Eke ^= De; \ - Bmi = ROL64(Eke, 10); \ - Emi ^= Di; \ - Bmo = ROL64(Emi, 15); \ - Eso ^= Do; \ - Bmu = ROL64(Eso, 56); \ - Ama = Bma ^((~Bme)& Bmi ); \ - Ame = Bme ^((~Bmi)& Bmo ); \ - Ami = Bmi ^((~Bmo)& Bmu ); \ - Amo = Bmo ^((~Bmu)& Bma ); \ - Amu = Bmu ^((~Bma)& Bme ); \ - Ebi ^= Di; \ - Bsa = ROL64(Ebi, 62); \ - Ego ^= Do; \ - Bse = ROL64(Ego, 55); \ - Eku ^= Du; \ - Bsi = ROL64(Eku, 39); \ - Ema ^= Da; \ - Bso = ROL64(Ema, 41); \ - Ese ^= De; \ - Bsu = ROL64(Ese, 2); \ - Asa = Bsa ^((~Bse)& Bsi ); \ - Ase = Bse ^((~Bsi)& Bso ); \ - Asi = Bsi ^((~Bso)& Bsu ); \ - Aso = Bso ^((~Bsu)& Bsa ); \ - Asu = Bsu ^((~Bsa)& Bse ); - - #define K12_security 128 - #define K12_capacity (2 * K12_security) - #define K12_capacityInBytes (K12_capacity / 8) - #define K12_rateInBytes ((1600 - K12_capacity) / 8) - #define K12_chunkSize 8192 - #define K12_suffixLeaf 0x0B - - typedef struct - { - unsigned char 
state[200]; - unsigned char byteIOIndex; - } KangarooTwelve_F; - - void KeccakP1600_Permute_12rounds(unsigned char* state) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)state; - copyFromState(stateAsLanes) - rounds12 - copyToState(stateAsLanes) - } - - void KangarooTwelve_F_Absorb(KangarooTwelve_F* instance, unsigned char* data, unsigned long long dataByteLen) - { - unsigned long long i = 0; - while (i < dataByteLen) - { - if (!instance->byteIOIndex && dataByteLen >= i + K12_rateInBytes) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)instance->state; - copyFromState(stateAsLanes) - unsigned long long modifiedDataByteLen = dataByteLen - i; - while (modifiedDataByteLen >= K12_rateInBytes) - { - Aba ^= ((unsigned long long*)data)[0]; - Abe ^= ((unsigned long long*)data)[1]; - Abi ^= ((unsigned long long*)data)[2]; - Abo ^= ((unsigned long long*)data)[3]; - Abu ^= ((unsigned long long*)data)[4]; - Aga ^= ((unsigned long long*)data)[5]; - Age ^= ((unsigned long long*)data)[6]; - Agi ^= ((unsigned long long*)data)[7]; - Ago ^= ((unsigned long long*)data)[8]; - Agu ^= ((unsigned long long*)data)[9]; - Aka ^= ((unsigned long long*)data)[10]; - Ake ^= ((unsigned long long*)data)[11]; - Aki ^= ((unsigned long long*)data)[12]; - Ako ^= ((unsigned long long*)data)[13]; - Aku ^= ((unsigned long long*)data)[14]; - Ama ^= ((unsigned long long*)data)[15]; - Ame ^= ((unsigned long long*)data)[16]; - Ami ^= ((unsigned long long*)data)[17]; - Amo ^= ((unsigned long long*)data)[18]; - Amu ^= ((unsigned long long*)data)[19]; - Asa ^= ((unsigned long long*)data)[20]; - rounds12 - data += K12_rateInBytes; - modifiedDataByteLen -= K12_rateInBytes; - } - copyToState(stateAsLanes) - i = dataByteLen - modifiedDataByteLen; - } - else - { - unsigned char partialBlock; - if ((dataByteLen - i) + instance->byteIOIndex > K12_rateInBytes) - { - partialBlock = K12_rateInBytes - instance->byteIOIndex; - } - else - { - partialBlock = 
(unsigned char)(dataByteLen - i); - } - i += partialBlock; - - if (!instance->byteIOIndex) - { - unsigned int j = 0; - for (; (j + 8) <= (unsigned int)(partialBlock >> 3); j += 8) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - ((unsigned long long*)instance->state)[j + 4] ^= ((unsigned long long*)data)[j + 4]; - ((unsigned long long*)instance->state)[j + 5] ^= ((unsigned long long*)data)[j + 5]; - ((unsigned long long*)instance->state)[j + 6] ^= ((unsigned long long*)data)[j + 6]; - ((unsigned long long*)instance->state)[j + 7] ^= ((unsigned long long*)data)[j + 7]; - } - for (; (j + 4) <= (unsigned int)(partialBlock >> 3); j += 4) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - } - for (; (j + 2) <= (unsigned int)(partialBlock >> 3); j += 2) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - } - if (j < (unsigned int)(partialBlock >> 3)) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - } - if (partialBlock & 7) - { - unsigned long long lane = 0; - CopyMemory(&lane, data + (partialBlock & 0xFFFFFFF8), partialBlock & 7); - ((unsigned long long*)instance->state)[partialBlock >> 3] ^= lane; - } - } - else - { - unsigned int _sizeLeft = partialBlock; - unsigned int _lanePosition = 
instance->byteIOIndex >> 3; - unsigned int _offsetInLane = instance->byteIOIndex & 7; - const unsigned char* _curData = data; - while (_sizeLeft > 0) - { - unsigned int _bytesInLane = 8 - _offsetInLane; - if (_bytesInLane > _sizeLeft) - { - _bytesInLane = _sizeLeft; - } - if (_bytesInLane) - { - unsigned long long lane = 0; - CopyMemory(&lane, (void*)_curData, _bytesInLane); - ((unsigned long long*)instance->state)[_lanePosition] ^= (lane << (_offsetInLane << 3)); - } - _sizeLeft -= _bytesInLane; - _lanePosition++; - _offsetInLane = 0; - _curData += _bytesInLane; - } - } - data += partialBlock; - instance->byteIOIndex += partialBlock; - if (instance->byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - } - } - } +/* void KangarooTwelve(unsigned char* input, unsigned int inputByteLen, unsigned char* output, unsigned int outputByteLen) { KangarooTwelve_F queueNode; @@ -712,972 +228,27 @@ extern "C" { KeccakP1600_Permute_12rounds(finalNode.state); CopyMemory(output, finalNode.state, outputByteLen); } - - int KangarooTwelveCryptoHashFunction(unsigned char* input, unsigned int inputByteLen, unsigned char* output) +*/ + +/** Extendable ouput function KangarooTwelve. + * @param input Pointer to the input message (M). + * @param inputByteLen The length of the input message in bytes. + * @param output Pointer to the output buffer. + * @param outputByteLen The desired number of output bytes. + * @param customization Pointer to the customization string (C). + * @param customByteLen The length of the customization string in bytes. + * @return 0 if successful, 1 otherwise. 
+ */ + //int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + extern int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); + + + int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output) { - KangarooTwelve(input, inputByteLen, output, 64); + KangarooTwelve(input, inputByteLen, output, 64, NULL, 0); return 0; } - - void KangarooTwelve64To32(unsigned char* input, unsigned char* output) - { - #if AVX512 - __m512i Baeiou = _mm512_maskz_loadu_epi64(0x1F, input); - __m512i Gaeiou = _mm512_set_epi64(0, 0, 0, 0, 0x0700, ((unsigned long long*)input)[7], ((unsigned long long*)input)[6], ((unsigned long long*)input)[5]); - - __m512i b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, zero, 0x96), zero, padding, 0x96); - __m512i b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - __m512i b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(zero, b0, b1, 0x96), rhoK)); - __m512i b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(zero, b0, b1, 0x96), rhoM)); - __m512i b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(padding, b0, b1, 0x96), rhoS)); - __m512i b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst0); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - __m512i Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - __m512i Maeiou = 
_mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - __m512i Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst1); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = 
_mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst2); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = 
_mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst3); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = 
_mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst4); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = 
_mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst5); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, 
b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst6); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst7); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = 
_mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst8); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, 
Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst9); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), 
Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - Baeiou = _mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst10); - Gaeiou = _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2); - Kaeiou = _mm512_ternarylogic_epi64(b2, b3, b4, 0xD2); - Maeiou = _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2); - Saeiou = _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2); - b0 = _mm512_permutex2var_epi64(_mm512_unpacklo_epi64(Baeiou, Gaeiou), pi2S1, Saeiou); - b2 = _mm512_permutex2var_epi64(_mm512_unpackhi_epi64(Baeiou, Gaeiou), pi2S2, Saeiou); - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); - Saeiou = _mm512_mask_blend_epi64(0x10, _mm512_permutex2var_epi64(b0, pi2S3, b1), Saeiou); - - b0 = _mm512_ternarylogic_epi64(_mm512_ternarylogic_epi64(Baeiou, Gaeiou, Kaeiou, 0x96), Maeiou, Saeiou, 0x96); - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); - b0 = _mm512_rol_epi64(_mm512_permutexvar_epi64(moveThetaNext, b0), 1); - b2 = _mm512_permutexvar_epi64(pi1K, 
_mm512_rolv_epi64(_mm512_ternarylogic_epi64(Kaeiou, b0, b1, 0x96), rhoK)); - b3 = _mm512_permutexvar_epi64(pi1M, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Maeiou, b0, b1, 0x96), rhoM)); - b4 = _mm512_permutexvar_epi64(pi1S, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Saeiou, b0, b1, 0x96), rhoS)); - b5 = _mm512_permutexvar_epi64(pi1G, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Gaeiou, b0, b1, 0x96), rhoG)); - b0 = _mm512_permutexvar_epi64(pi1B, _mm512_rolv_epi64(_mm512_ternarylogic_epi64(Baeiou, b0, b1, 0x96), rhoB)); - - _mm512_mask_storeu_epi64(output, 0xF, _mm512_permutex2var_epi64(_mm512_permutex2var_epi64(_mm512_unpacklo_epi64(_mm512_xor_si512(_mm512_ternarylogic_epi64(b0, b5, b2, 0xD2), K12RoundConst11), _mm512_ternarylogic_epi64(b5, b2, b3, 0xD2)), pi2S1, _mm512_ternarylogic_epi64(b4, b0, b5, 0xD2)), pi2BG, _mm512_unpacklo_epi64(_mm512_ternarylogic_epi64(b2, b3, b4, 0xD2), _mm512_ternarylogic_epi64(b3, b4, b0, 0xD2)))); - #else - unsigned long long Aba, Abe, Abi, Abo, Abu; - unsigned long long Aga, Age, Agi, Ago, Agu; - unsigned long long Aka, Ake, Aki, Ako, Aku; - unsigned long long Ama, Ame, Ami, Amo, Amu; - unsigned long long Asa, Ase, Asi, Aso, Asu; - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; - unsigned long long Bka, Bke, Bki, Bko, Bku; - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; - unsigned long long Ca, Ce, Ci, Co, Cu; - unsigned long long Da, De, Di, Do, Du; - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; - unsigned long long Ega, Ege, Egi, Ego, Egu; - unsigned long long Eka, Eke, Eki, Eko, Eku; - unsigned long long Ema, Eme, Emi, Emo, Emu; - unsigned long long Esa, Ese, Esi, Eso, Esu; - - Ca = ((unsigned long long*)input)[0] ^ ((unsigned long long*)input)[5] ^ 0x8000000000000000; - Ce = ((unsigned long long*)input)[1] ^ ((unsigned long long*)input)[6]; - Ci = ((unsigned long long*)input)[2] ^ ((unsigned long long*)input)[7]; - Co = ((unsigned long 
long*)input)[3] ^ 0x0700; - - Da = ((unsigned long long*)input)[4] ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(((unsigned long long*)input)[4], 1); - Du = Co ^ ROL64(Ca, 1); - Aba = ((unsigned long long*)input)[0] ^ Da; - Bbe = ROL64(((unsigned long long*)input)[6] ^ De, 44); - Bbi = ROL64(Di, 43); - Bbo = ROL64(Do, 21); - Bbu = ROL64(Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x000000008000808bULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(((unsigned long long*)input)[3] ^ Do, 28); - Bge = ROL64(Du, 20); - Bgi = ROL64(Da, 3); - Bgo = ROL64(De, 45); - Bgu = ROL64(Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(((unsigned long long*)input)[1] ^ De, 1); - Bke = ROL64(((unsigned long long*)input)[7] ^ Di, 6); - Bki = ROL64(Do, 25); - Bko = ROL64(Du, 8); - Bku = ROL64(Da ^ 0x8000000000000000, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(((unsigned long long*)input)[4] ^ Du, 27); - Bme = ROL64(((unsigned long long*)input)[5] ^ Da, 36); - Bmi = ROL64(De, 10); - Bmo = ROL64(Di, 15); - Bmu = ROL64(Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(((unsigned long long*)input)[2] ^ Di, 62); - Bse = ROL64(Do ^ 0x0700, 55); - Bsi = ROL64(Du, 39); - Bso = ROL64(Da, 41); - Bsu = ROL64(De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, 
Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x800000000000008bULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age 
^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008089ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co 
= Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008003ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu 
^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000008002ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 
1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000000000080ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - 
Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x000000000000800aULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ 
De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ __andn_u64(Bbe, Bbi) ^ 0x800000008000000aULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ 
Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000080008081ULL; - Ebe = Bbe ^ __andn_u64(Bbi, Bbo); - Ebi = Bbi ^ __andn_u64(Bbo, Bbu); - Ebo = Bbo ^ __andn_u64(Bbu, Aba); - Ebu = Bbu ^ __andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ __andn_u64(Bge, Bgi); - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Egi = Bgi ^ __andn_u64(Bgo, Bgu); - Ego = Bgo ^ __andn_u64(Bgu, Bga); - Egu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ __andn_u64(Bke, Bki); - Eke = Bke ^ __andn_u64(Bki, Bko); - Eki = Bki ^ __andn_u64(Bko, Bku); - Eko = Bko ^ __andn_u64(Bku, Bka); - Eku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ __andn_u64(Bme, Bmi); - Eme = Bme ^ __andn_u64(Bmi, Bmo); - Emi = Bmi ^ __andn_u64(Bmo, Bmu); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Emu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ __andn_u64(Bse, Bsi); - Ese = Bse ^ __andn_u64(Bsi, Bso); - Esi = Bsi ^ __andn_u64(Bso, Bsu); - Eso = Bso ^ __andn_u64(Bsu, Bsa); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ 
__andn_u64(Bbe, Bbi) ^ 0x8000000000008080ULL; - Abe = Bbe ^ __andn_u64(Bbi, Bbo); - Abi = Bbi ^ __andn_u64(Bbo, Bbu); - Abo = Bbo ^ __andn_u64(Bbu, Eba); - Abu = Bbu ^ __andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ __andn_u64(Bge, Bgi); - Age = Bge ^ __andn_u64(Bgi, Bgo); - Agi = Bgi ^ __andn_u64(Bgo, Bgu); - Ago = Bgo ^ __andn_u64(Bgu, Bga); - Agu = Bgu ^ __andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ __andn_u64(Bke, Bki); - Ake = Bke ^ __andn_u64(Bki, Bko); - Aki = Bki ^ __andn_u64(Bko, Bku); - Ako = Bko ^ __andn_u64(Bku, Bka); - Aku = Bku ^ __andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ __andn_u64(Bme, Bmi); - Ame = Bme ^ __andn_u64(Bmi, Bmo); - Ami = Bmi ^ __andn_u64(Bmo, Bmu); - Amo = Bmo ^ __andn_u64(Bmu, Bma); - Amu = Bmu ^ __andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ __andn_u64(Bse, Bsi); - Ase = Bse ^ __andn_u64(Bsi, Bso); - Asi = Bsi ^ __andn_u64(Bso, Bsu); - Aso = Bso ^ __andn_u64(Bsu, Bsa); - Asu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Bba = Aba ^ Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = 
ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Eba = Bba ^ __andn_u64(Bbe, Bbi) ^ 0x0000000080000001ULL; - Ege = Bge ^ __andn_u64(Bgi, Bgo); - Eki = Bki ^ __andn_u64(Bko, Bku); - Emo = Bmo ^ __andn_u64(Bmu, Bma); - Esu = Bsu ^ __andn_u64(Bsa, Bse); - Ca = Eba ^ Bga ^ Bka ^ Bma ^ Bsa ^ __andn_u64(Bge, Bgi) ^ __andn_u64(Bke, Bki) ^ __andn_u64(Bme, Bmi) ^ __andn_u64(Bse, Bsi); - Ce = Bbe ^ Ege ^ Bke ^ Bme ^ Bse ^ __andn_u64(Bbi, Bbo) ^ __andn_u64(Bki, Bko) ^ __andn_u64(Bmi, Bmo) ^ __andn_u64(Bsi, Bso); - Ci = Bbi ^ Bgi ^ Eki ^ Bmi ^ Bsi ^ __andn_u64(Bbo, Bbu) ^ __andn_u64(Bgo, Bgu) ^ __andn_u64(Bmo, Bmu) ^ __andn_u64(Bso, Bsu); - Co = Bbo ^ Bgo ^ Bko ^ Emo ^ Bso ^ __andn_u64(Bbu, Bba) ^ __andn_u64(Bgu, Bga) ^ __andn_u64(Bku, Bka) ^ __andn_u64(Bsu, Bsa); - Cu = Bbu ^ Bgu ^ Bku ^ Bmu ^ Esu ^ __andn_u64(Bba, Bbe) ^ __andn_u64(Bga, Bge) ^ __andn_u64(Bka, Bke) ^ __andn_u64(Bma, Bme); - - Bba = Eba ^ Cu ^ ROL64(Ce, 1); - Bbe = ROL64(Ege ^ Ca ^ ROL64(Ci, 1), 44); - Bbi = ROL64(Eki ^ Ce ^ ROL64(Co, 1), 43); - Bbo = ROL64(Emo ^ Ci ^ ROL64(Cu, 1), 21); - Bbu = ROL64(Esu ^ Co ^ ROL64(Ca, 1), 14); - ((unsigned long long*)output)[0] = Bba ^ __andn_u64(Bbe, Bbi) ^ 0x8000000080008008ULL; - ((unsigned long long*)output)[1] = Bbe ^ __andn_u64(Bbi, Bbo); - ((unsigned long long*)output)[2] = Bbi ^ __andn_u64(Bbo, Bbu); - ((unsigned long long*)output)[3] = Bbo ^ __andn_u64(Bbu, Bba); - #endif - } - - - /* Qubic Specific */ typedef unsigned long long felm_t[2]; // Datatype for representing 128-bit field elements typedef felm_t 
f2elm_t[2]; // Datatype for representing quadratic extension field elements @@ -1709,7 +280,7 @@ extern "C" { } } unsigned int identityBytesChecksum; - KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3); + KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); identityBytesChecksum &= 0x3FFFF; for (int i = 0; i < 4; i++) { @@ -1721,7 +292,7 @@ extern "C" { void getPrivateKey(unsigned char* subseed, unsigned char* privateKey) { - KangarooTwelve(subseed, 32, privateKey, 32); + KangarooTwelve(subseed, 32, privateKey, 32, NULL, 0); } @@ -1754,7 +325,7 @@ extern "C" { } } unsigned int identityBytesChecksum; - KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3); + KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); identityBytesChecksum &= 0x3FFFF; for (int i = 0; i < 4; i++) { @@ -1780,7 +351,7 @@ extern "C" { } seedBytes[i] = seed[i] - 'a'; } - KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32); + KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32, NULL, 0); return true; } diff --git a/identity/build.rs b/identity/build.rs index 11cfb38..3f29496 100644 --- a/identity/build.rs +++ b/identity/build.rs @@ -13,7 +13,7 @@ fn main() { } else { cc::Build::new() .define("__LINUX__", "1") - .define("_ARM_", "1") + .define("_X86_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -32,9 +32,11 @@ fn main() { cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") - .define("_AMD64_", "1") + .define("__LINUX__", "1") + .define("_X86_", "1") .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") } } From 23d99914e5aa3476403c73375c7d14d7cc588bb7 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 18:04:36 -0500 Subject: [PATCH 2/7] Add more K12 
functions --- .../lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S | 623 ++++++++++ .../K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h | 65 + .../K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c | 227 ++++ .../K12/lib/Inplace32BI/KeccakP-1600-SnP.h | 35 + .../Inplace32BI/KeccakP-1600-inplace32BI.c | 1068 +++++++++++++++++ .../K12/lib/Optimized64/KeccakP-1600-AVX2.s | 664 ++++++++++ .../Optimized64/KeccakP-1600-AVX512-plainC.c | 241 ++++ .../K12/lib/Optimized64/KeccakP-1600-AVX512.s | 551 +++++++++ .../K12/lib/Optimized64/KeccakP-1600-SnP.h | 74 ++ .../K12/lib/Optimized64/KeccakP-1600-opt64.c | 1026 ++++++++++++++++ .../KeccakP-1600-runtimeDispatch.c | 406 +++++++ .../Optimized64/KeccakP-1600-timesN-AVX2.c | 419 +++++++ .../Optimized64/KeccakP-1600-timesN-AVX512.c | 458 +++++++ .../Optimized64/KeccakP-1600-timesN-SSSE3.c | 438 +++++++ 14 files changed, 6295 insertions(+) create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c create mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c create mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S new file 
mode 100644 index 0000000..09aa0d2 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S @@ -0,0 +1,623 @@ +# K12 based on the eXtended Keccak Code Package (XKCP) +# https://github.com/XKCP/XKCP +# +# The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. +# +# Implementation by Gilles Van Assche, hereby denoted as "the implementer". +# Core subroutine is based on one by Andy Polyakov, available +# at https://github.com/dot-asm/cryptogams. Used with permission. +# +# For more information, feedback or questions, please refer to the Keccak Team website: +# https://keccak.team/ +# +# To the extent possible under law, the implementer has waived all copyright +# and related or neighboring rights to the source code in this file. +# http://creativecommons.org/publicdomain/zero/1.0/ + +.text + +.balign 64 // strategic alignment and padding that allows to use + // address value as loop termination condition... + .quad 0,0,0,0,0,0,0,0 +.ifdef macOS +.else +.type iotas,%object +.endif +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a +iotas12: + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.ifdef macOS +.else +.size iotas,.-iotas +.endif + +.ifdef macOS +.else +.type KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,%function +.endif +KeccakP1600_ARMv8Asha3_Permute_12rounds_internal: +.balign 32 + mov x9,#12 + adr x10,iotas12 + b .Loop_ce +.balign 16 
+.Loop_ce: + ////////////////////////////////////////////////// Theta + eor3 v25.16b,v20.16b,v15.16b,v10.16b + eor3 v26.16b,v21.16b,v16.16b,v11.16b + eor3 v27.16b,v22.16b,v17.16b,v12.16b + eor3 v28.16b,v23.16b,v18.16b,v13.16b + eor3 v29.16b,v24.16b,v19.16b,v14.16b + eor3 v25.16b,v25.16b, v5.16b,v0.16b + eor3 v26.16b,v26.16b, v6.16b,v1.16b + eor3 v27.16b,v27.16b, v7.16b,v2.16b + eor3 v28.16b,v28.16b, v8.16b,v3.16b + eor3 v29.16b,v29.16b, v9.16b,v4.16b + + rax1 v30.2d,v25.2d,v27.2d // D[1] + rax1 v31.2d,v26.2d,v28.2d // D[2] + rax1 v27.2d,v27.2d,v29.2d // D[3] + rax1 v28.2d,v28.2d,v25.2d // D[4] + rax1 v29.2d,v29.2d,v26.2d // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi + xar v25.2d, v1.2d,v30.2d,#64-1 // C[0]=A[2][0] + + xar v1.2d,v6.2d,v30.2d,#64-44 + xar v6.2d,v9.2d,v28.2d,#64-20 + xar v9.2d,v22.2d,v31.2d,#64-61 + xar v22.2d,v14.2d,v28.2d,#64-39 + xar v14.2d,v20.2d,v29.2d,#64-18 + + xar v26.2d, v2.2d,v31.2d,#64-62 // C[1]=A[4][0] + + xar v2.2d,v12.2d,v31.2d,#64-43 + xar v12.2d,v13.2d,v27.2d,#64-25 + xar v13.2d,v19.2d,v28.2d,#64-8 + xar v19.2d,v23.2d,v27.2d,#64-56 + xar v23.2d,v15.2d,v29.2d,#64-41 + + xar v15.2d,v4.2d,v28.2d,#64-27 + + xar v28.2d, v24.2d,v28.2d,#64-14 // D[4]=A[0][4] + xar v24.2d,v21.2d,v30.2d,#64-2 + xar v8.2d,v8.2d,v27.2d,#64-55 // A[1][3]=A[4][1] + xar v4.2d,v16.2d,v30.2d,#64-45 // A[0][4]=A[1][3] + xar v16.2d,v5.2d,v29.2d,#64-36 + + xar v5.2d,v3.2d,v27.2d,#64-28 + + eor v0.16b,v0.16b,v29.16b + + xar v27.2d, v18.2d,v27.2d,#64-21 // D[3]=A[0][3] + xar v3.2d,v17.2d,v31.2d,#64-15 // A[0][3]=A[3][3] + xar v30.2d, v11.2d,v30.2d,#64-10 // D[1]=A[3][2] + xar v31.2d, v7.2d,v31.2d,#64-6 // D[2]=A[2][1] + xar v29.2d, v10.2d,v29.2d,#64-3 // D[0]=A[1][2] + + ////////////////////////////////////////////////// Chi+Iota + bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] + bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] + bcax v22.16b,v22.16b,v24.16b,v23.16b + bcax v23.16b,v23.16b,v26.16b, v24.16b + bcax 
v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] + + ld1r {v26.2d},[x10],#8 + + bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] + bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] + bcax v19.16b,v19.16b,v16.16b,v15.16b + bcax v15.16b,v15.16b,v30.16b, v16.16b + bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] + + bcax v10.16b,v25.16b, v12.16b,v31.16b + bcax v11.16b,v31.16b, v13.16b,v12.16b + bcax v12.16b,v12.16b,v14.16b,v13.16b + bcax v13.16b,v13.16b,v25.16b, v14.16b + bcax v14.16b,v14.16b,v31.16b, v25.16b + + bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] + bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] + bcax v9.16b,v9.16b,v6.16b,v5.16b + bcax v5.16b,v5.16b,v29.16b, v6.16b + bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] + + bcax v3.16b,v27.16b, v0.16b,v28.16b + bcax v4.16b,v28.16b, v1.16b,v0.16b + bcax v0.16b,v0.16b,v2.16b,v1.16b + bcax v1.16b,v1.16b,v27.16b, v2.16b + bcax v2.16b,v2.16b,v28.16b, v27.16b + + eor v0.16b,v0.16b,v26.16b + + subs x9,x9,#1 + bne .Loop_ce + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,.-KeccakP1600_ARMv8Asha3_Permute_12rounds_internal +.endif + +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_Permute_12rounds +_KeccakP1600_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600_ARMv8Asha3_Permute_12rounds +.type KeccakP1600_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + ldr x30,[sp,#8] + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds,.-KeccakP1600_ARMv8Asha3_Permute_12rounds +.endif + +// size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb( +// void *state(x0), +// unsigned int laneCount(x1) = 21, +// const unsigned char *data(x2), +// size_t dataByteLen(x3)) +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +_KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.type KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,%function +KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + + // Prepare the return value + mov x11, #0 + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop + +.balign 16 +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop: + subs x3, x3, #8*21 + b.cc .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end + + // Lanes 0-3 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v0.16b, v0.16b, v27.16b + eor v1.16b, v1.16b, v28.16b + eor v2.16b, v2.16b, v29.16b + eor v3.16b, v3.16b, v30.16b + + // Lanes 4-7 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v4.16b, v4.16b, v27.16b + eor v5.16b, v5.16b, v28.16b + eor v6.16b, v6.16b, v29.16b + eor v7.16b, v7.16b, v30.16b + + // Lanes 8-11 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v8.16b, v8.16b, v27.16b + eor v9.16b, v9.16b, v28.16b + eor v10.16b, v10.16b, v29.16b + eor v11.16b, v11.16b, v30.16b + + // Lanes 12-15 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + eor v15.16b, v15.16b, v30.16b + + // Lanes 16-20 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v16.16b, v16.16b, v27.16b + eor v17.16b, v17.16b, v28.16b + eor v18.16b, v18.16b, v29.16b + eor v19.16b, v19.16b, v30.16b + ld1 {v27.8b}, [x2], #8 + eor v20.16b, v20.16b, v27.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + add x11, x11, #8*21 + + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end: + + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp 
d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + mov x0, x11 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,.-KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.endif + +.ifdef macOS +.globl _KeccakP1600times2_ARMv8Asha3_Permute_12rounds +_KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.type KeccakP1600times2_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ld1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + ld1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + ld1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + ld1 {v24.2d}, [x0] + sub x0, x0, #64*6 + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + ldr x30,[sp,#8] + st1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + st1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + st1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + st1 {v24.2d}, [x0] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600times2_ARMv8Asha3_Permute_12rounds,.-KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.endif + +.ifdef macOS +.globl _KangarooTwelve_ARMv8Asha3_Process2Leaves +_KangarooTwelve_ARMv8Asha3_Process2Leaves: +.else +.globl 
KangarooTwelve_ARMv8Asha3_Process2Leaves +.type KangarooTwelve_ARMv8Asha3_Process2Leaves,%function +KangarooTwelve_ARMv8Asha3_Process2Leaves: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + movi v4.2d, #0 + movi v5.2d, #0 + movi v6.2d, #0 + movi v7.2d, #0 + movi v8.2d, #0 + movi v9.2d, #0 + movi v10.2d, #0 + movi v11.2d, #0 + movi v12.2d, #0 + movi v13.2d, #0 + movi v14.2d, #0 + movi v15.2d, #0 + movi v16.2d, #0 + movi v17.2d, #0 + movi v18.2d, #0 + movi v19.2d, #0 + movi v20.2d, #0 + movi v21.2d, #0 + movi v22.2d, #0 + movi v23.2d, #0 + movi v24.2d, #0 + + // x12 is input + chunkSize + add x12, x0, #8192 + + // Loop over the first 48 blocks + mov x11, 48 + b .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks +.KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks: + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b 
+ rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + // Lanes 16-20 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 + ld1 {v29.d}[0], [x0], #8 + ld1 {v29.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b + rev64 v29.16b, v29.16b +#endif + eor v16.16b, v16.16b, v25.16b + eor v17.16b, v17.16b, v26.16b + eor v18.16b, v18.16b, v27.16b + eor v19.16b, v19.16b, v28.16b + eor v20.16b, v20.16b, v29.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + subs x11, x11, #1 + bne .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif 
+ eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + mov x13, #0x0B + dup v25.2d, x13 + mov x13, #0x8000000000000000 + dup v26.2d, x13 + eor v16.16b, v16.16b, v25.16b + eor v20.16b, v20.16b, v26.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + st1 {v0.1d-v3.1d}, [x1], #32 + st1 {v0.d}[1], [x1], #8 + st1 {v1.d}[1], [x1], #8 + st1 {v2.d}[1], [x1], #8 + st1 {v3.d}[1], [x1], #8 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KangarooTwelve_ARMv8Asha3_Process2Leaves,.-KangarooTwelve_ARMv8Asha3_Process2Leaves +.endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h new file mode 100644 index 0000000..512eca3 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h @@ -0,0 +1,65 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 
+ +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600_Initialize KeccakP1600_opt64_Initialize +#define KeccakP1600_AddByte KeccakP1600_opt64_AddByte +#define KeccakP1600_AddBytes KeccakP1600_opt64_AddBytes +#define KeccakP1600_Permute_12rounds KeccakP1600_ARMv8Asha3_Permute_12rounds +#define KeccakP1600_ExtractBytes KeccakP1600_opt64_ExtractBytes +#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); +void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state); +void KangarooTwelve_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output); + 
+#define KeccakP1600times2_Permute_12rounds KeccakP1600times2_ARMv8Asha3_Permute_12rounds +#define KangarooTwelve_Process2Leaves KangarooTwelve_ARMv8Asha3_Process2Leaves + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c new file mode 100644 index 0000000..7228d7a --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c @@ -0,0 +1,227 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include +#include +#include +#include + +const char * KeccakP1600_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane; + + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; + ((uint64_t*)state)[lanePosition] ^= lane; +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ + unsigned int i = 0; + + for( ; (i+8)<=laneCount; i+=8) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4]; + ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5]; + ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6]; + ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7]; + } + for( ; (i+4)<=laneCount; i+=4) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + } + for( ; (i+2)<=laneCount; i+=2) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + } + if (i 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, 
_curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ + memcpy(data, state, laneCount*8); +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; 
\ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable() +{ + return 1; +} + +const char * KeccakP1600times2_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times4_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ +} + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times8_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ +} diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h new file mode 100644 index 0000000..ac76272 --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h @@ -0,0 +1,35 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); + +#endif diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c new file mode 100644 index 0000000..a72dc7c --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c @@ -0,0 +1,1068 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +const char * KeccakP1600_GetImplementation() +{ + return "in-place 32-bit implementation"; +} + + +#define ROL32(a, offset) ((((uint32_t)a) << (offset)) ^ (((uint32_t)a) >> (32-(offset)))) + +/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + temp0 = (low); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp1 = (high); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); + +#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd = (temp0 >> 16) | (temp1 & 0xFFFF0000); + +/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + temp0 = (even); \ + temp1 = (odd); \ + temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \ + temp0 = temp; \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); + +#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + low = temp0; \ + high = temp1; + +#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + lowOut = lowIn ^ temp0; \ + highOut = highIn ^ temp1; + +void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0xFF, offset); + memset(laneAsBytes+offset, 0x00, length); + memset(laneAsBytes+offset+length, 0xFF, 8-offset-length); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | 
((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) +{ + unsigned int lanePosition = offset/8; + unsigned int offsetInLane = offset%8; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + if (offsetInLane < 4) { + low = (uint32_t)byte << (offsetInLane*8); + high = 0; + } + else { + low = 0; + high = (uint32_t)byte << ((offsetInLane-4)*8); + } + toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0, 8); + memcpy(laneAsBytes+offset, data, length); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | ((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndXOR(low, high, 
stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + const uint32_t * pI = (const uint32_t *)data; + uint32_t * pS = (uint32_t*)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + memcpy(&low, pI++, 4); + memcpy(&high, pI++, 4); + toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1); +#else + toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint32_t *stateAsHalfLanes = (uint32_t*)state; + uint32_t low, high, temp, temp0, temp1; + uint8_t laneAsBytes[8]; + + fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + *((uint32_t*)(laneAsBytes+0)) = low; + *((uint32_t*)(laneAsBytes+4)) = high; +#else + laneAsBytes[0] 
= low & 0xFF; + laneAsBytes[1] = (low >> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low >> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; +#endif + memcpy(data, laneAsBytes+offset, length); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint32_t * pI = (uint32_t *)data; + const uint32_t * pS = ( const uint32_t *)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1); + memcpy(pI++, &low, 4); + memcpy(pI++, &high, 4); +#else + fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low >> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; + memcpy(data+lanePosition*8, laneAsBytes, 8); + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = 
(offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +static const uint32_t KeccakF1600RoundConstants_int2[2*24+1] = +{ + 0x00000001UL, 0x00000000UL, + 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, + 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, + 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, + 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, + 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, + 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, + 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, + 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, + 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, + 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, + 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, + 0x00000000UL, 0x80008082UL, + 0x000000FFUL +}; + +#define KeccakRound0() \ + Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \ + Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ + Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ + De1 = Cz^Cw; 
\ + Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Age0^De0), 22); \ + Bi = ROL32((Aki1^Di1), 22); \ + Bo = ROL32((Amo1^Do1), 11); \ + Bu = ROL32((Asu0^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Age0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Age1^De1), 22); \ + Bi = ROL32((Aki0^Di0), 21); \ + Bo = ROL32((Amo0^Do0), 10); \ + Bu = ROL32((Asu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Age1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka1^Da1), 2); \ + Bo = ROL32((Ame1^De1), 23); \ + Bu = ROL32((Asi1^Di1), 31); \ + Ba = ROL32((Abo0^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka0^Da0), 1); \ + Bo = ROL32((Ame0^De0), 22); \ + Bu = ROL32((Asi0^Di0), 30); \ + Ba = ROL32((Abo1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa0^Da0), 9); \ + Ba = ROL32((Abe1^De1), 1); \ + Be = ROL32((Agi0^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Amu0^Du0), 4); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa1^Da1), 9); \ + Ba = (Abe0^De0); \ + Be = ROL32((Agi1^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Amu1^Du1), 4); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Amu1 = 
Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga0^Da0), 18); \ + Bi = ROL32((Ake0^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Aso0^Do0), 28); \ + Ba = ROL32((Abu1^Du1), 14); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga1^Da1), 18); \ + Bi = ROL32((Ake1^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Aso1^Do1), 28); \ + Ba = ROL32((Abu0^Du0), 13); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Abi0^Di0), 31); \ + Be = ROL32((Ago1^Do1), 28); \ + Bi = ROL32((Aku1^Du1), 20); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Abi1^Di1), 31); \ + Be = ROL32((Ago0^Do0), 27); \ + Bi = ROL32((Aku0^Du0), 19); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ) + +#define KeccakRound1() \ + Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \ + Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \ + Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \ + De1 = Cz^Cw; \ + Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ame1^De0), 22); \ + Bi = ROL32((Agi1^Di1), 22); \ + Bo = ROL32((Aso1^Do1), 11); \ + Bu = ROL32((Aku1^Du0), 7); \ + Aba0 = Ba 
^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ame0^De1), 22); \ + Bi = ROL32((Agi0^Di0), 21); \ + Bo = ROL32((Aso0^Do0), 10); \ + Bu = ROL32((Aku0^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa1^Da1), 2); \ + Bo = ROL32((Ake1^De1), 23); \ + Bu = ROL32((Abi1^Di1), 31); \ + Ba = ROL32((Amo1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa0^Da0), 1); \ + Bo = ROL32((Ake0^De0), 22); \ + Bu = ROL32((Abi0^Di0), 30); \ + Ba = ROL32((Amo0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama1^Da0), 9); \ + Ba = ROL32((Age1^De1), 1); \ + Be = ROL32((Asi1^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Abu1^Du0), 4); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama0^Da1), 9); \ + Ba = (Age0^De0); \ + Be = ROL32((Asi0^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Abu0^Du1), 4); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aka1^Da0), 18); \ + Bi = ROL32((Abe1^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Ago1^Do0), 28); \ + Ba = ROL32((Asu1^Du1), 14); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + 
Be = ROL32((Aka0^Da1), 18); \ + Bi = ROL32((Abe0^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Ago0^Do1), 28); \ + Ba = ROL32((Asu0^Du0), 13); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Aki1^Di0), 31); \ + Be = ROL32((Abo1^Do1), 28); \ + Bi = ROL32((Amu1^Du1), 20); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Aki0^Di1), 31); \ + Be = ROL32((Abo0^Do0), 27); \ + Bi = ROL32((Amu0^Du0), 19); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); + +#define KeccakRound2() \ + Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \ + Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \ + Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \ + De1 = Cz^Cw; \ + Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ake1^De0), 22); \ + Bi = ROL32((Asi0^Di1), 22); \ + Bo = ROL32((Ago0^Do1), 11); \ + Bu = ROL32((Amu1^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ake0^De1), 22); \ + Bi = ROL32((Asi1^Di0), 21); \ + Bo = ROL32((Ago1^Do0), 10); \ + Bu = ROL32((Amu0^Du1), 7); \ + Aba1 = 
Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama0^Da1), 2); \ + Bo = ROL32((Abe0^De1), 23); \ + Bu = ROL32((Aki0^Di1), 31); \ + Ba = ROL32((Aso1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama1^Da0), 1); \ + Bo = ROL32((Abe1^De0), 22); \ + Bu = ROL32((Aki1^Di0), 30); \ + Ba = ROL32((Aso0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga1^Da0), 9); \ + Ba = ROL32((Ame0^De1), 1); \ + Be = ROL32((Abi1^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Asu1^Du0), 4); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga0^Da1), 9); \ + Ba = (Ame1^De0); \ + Be = ROL32((Abi0^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Asu0^Du1), 4); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa1^Da0), 18); \ + Bi = ROL32((Age1^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Abo1^Do0), 28); \ + Ba = ROL32((Aku0^Du1), 14); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa0^Da1), 18); \ + Bi = ROL32((Age0^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Abo0^Do1), 28); \ + Ba = ROL32((Aku1^Du0), 13); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = 
ROL32((Aka0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Agi1^Di0), 31); \ + Be = ROL32((Amo0^Do1), 28); \ + Bi = ROL32((Abu0^Du1), 20); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aka1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Agi0^Di1), 31); \ + Be = ROL32((Amo1^Do0), 27); \ + Bi = ROL32((Abu1^Du0), 19); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); + +#define KeccakRound3() \ + Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \ + Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \ + Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \ + De1 = Cz^Cw; \ + Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Abe0^De0), 22); \ + Bi = ROL32((Abi0^Di1), 22); \ + Bo = ROL32((Abo0^Do1), 11); \ + Bu = ROL32((Abu0^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Abe1^De1), 22); \ + Bi = ROL32((Abi1^Di0), 21); \ + Bo = ROL32((Abo1^Do0), 10); \ + Bu = ROL32((Abu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga0^Da1), 2); \ + Bo = ROL32((Age0^De1), 23); \ + Bu = ROL32((Agi0^Di1), 31); \ + Ba = ROL32((Ago0^Do0), 14); \ + Be = 
ROL32((Agu0^Du0), 10); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga1^Da0), 1); \ + Bo = ROL32((Age1^De0), 22); \ + Bu = ROL32((Agi1^Di0), 30); \ + Ba = ROL32((Ago1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka0^Da0), 9); \ + Ba = ROL32((Ake0^De1), 1); \ + Be = ROL32((Aki0^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Aku0^Du0), 4); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka1^Da1), 9); \ + Ba = (Ake1^De0); \ + Be = ROL32((Aki1^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Aku1^Du1), 4); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama0^Da0), 18); \ + Bi = ROL32((Ame0^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Amo0^Do0), 28); \ + Ba = ROL32((Amu0^Du1), 14); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama1^Da1), 18); \ + Bi = ROL32((Ame1^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Amo1^Do1), 28); \ + Ba = ROL32((Amu1^Du0), 13); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Asa0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Asi0^Di0), 31); \ + Be = ROL32((Aso0^Do1), 28); \ + Bi = ROL32((Asu0^Du1), 20); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = 
ROL32((Asa1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Asi1^Di1), 31); \ + Be = ROL32((Aso1^Do0), 27); \ + Bi = ROL32((Asu1^Du0), 19); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); + +void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) +{ + uint32_t Da0, De0, Di0, Do0, Du0; + uint32_t Da1, De1, Di1, Do1, Du1; + uint32_t Ba, Be, Bi, Bo, Bu; + uint32_t Cx, Cy, Cz, Cw; + const uint32_t *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + #define Aba0 stateAsHalfLanes[ 0] + #define Aba1 stateAsHalfLanes[ 1] + #define Abe0 stateAsHalfLanes[ 2] + #define Abe1 stateAsHalfLanes[ 3] + #define Abi0 stateAsHalfLanes[ 4] + #define Abi1 stateAsHalfLanes[ 5] + #define Abo0 stateAsHalfLanes[ 6] + #define Abo1 stateAsHalfLanes[ 7] + #define Abu0 stateAsHalfLanes[ 8] + #define Abu1 stateAsHalfLanes[ 9] + #define Aga0 stateAsHalfLanes[10] + #define Aga1 stateAsHalfLanes[11] + #define Age0 stateAsHalfLanes[12] + #define Age1 stateAsHalfLanes[13] + #define Agi0 stateAsHalfLanes[14] + #define Agi1 stateAsHalfLanes[15] + #define Ago0 stateAsHalfLanes[16] + #define Ago1 stateAsHalfLanes[17] + #define Agu0 stateAsHalfLanes[18] + #define Agu1 stateAsHalfLanes[19] + #define Aka0 stateAsHalfLanes[20] + #define Aka1 stateAsHalfLanes[21] + #define Ake0 stateAsHalfLanes[22] + #define Ake1 stateAsHalfLanes[23] + #define Aki0 stateAsHalfLanes[24] + #define Aki1 stateAsHalfLanes[25] + #define Ako0 stateAsHalfLanes[26] + #define Ako1 stateAsHalfLanes[27] + #define Aku0 stateAsHalfLanes[28] + #define Aku1 stateAsHalfLanes[29] + #define Ama0 stateAsHalfLanes[30] + #define Ama1 stateAsHalfLanes[31] + #define Ame0 stateAsHalfLanes[32] + #define Ame1 stateAsHalfLanes[33] + #define Ami0 stateAsHalfLanes[34] + #define Ami1 stateAsHalfLanes[35] + #define Amo0 stateAsHalfLanes[36] + #define Amo1 stateAsHalfLanes[37] 
+ #define Amu0 stateAsHalfLanes[38] + #define Amu1 stateAsHalfLanes[39] + #define Asa0 stateAsHalfLanes[40] + #define Asa1 stateAsHalfLanes[41] + #define Ase0 stateAsHalfLanes[42] + #define Ase1 stateAsHalfLanes[43] + #define Asi0 stateAsHalfLanes[44] + #define Asi1 stateAsHalfLanes[45] + #define Aso0 stateAsHalfLanes[46] + #define Aso1 stateAsHalfLanes[47] + #define Asu0 stateAsHalfLanes[48] + #define Asu1 stateAsHalfLanes[49] + + nRounds &= 3; + switch ( nRounds ) + { + #define I0 Ba + #define I1 Be + #define T0 Bi + #define T1 Bo + #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[eo0] = T0; (in0)[eo0^1] = T1; \ + T0 = (in2)[0]; T1 = (in2)[1]; \ + (in1)[eo1] = T0; (in1)[eo1^1] = T1; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[eo2] = T0; (in2)[eo2^1] = T1; \ + (in3)[eo3] = I0; (in3)[eo3^1] = I1 + #define SwapPI2( in0,in1,in2,in3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[1] = T0; (in0)[0] = T1; \ + (in1)[1] = I0; (in1)[0] = I1; \ + I0 = (in2)[0]; I1 = (in2)[1]; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[1] = T0; (in2)[0] = T1; \ + (in3)[1] = I0; (in3)[0] = I1 + #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 + + case 1: + SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 ); + SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 ); + SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 ); + SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 ); + break; + + case 2: + SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 ); + SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 ); + SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 ); + SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 ); + SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 ); + break; + + case 3: + SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 ); + SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 ); + SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 ); + 
SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 ); + break; + #undef I0 + #undef I1 + #undef T0 + #undef T1 + #undef SwapPI13 + #undef SwapPI2 + #undef SwapEO + } + + do + { + /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */ + switch ( nRounds ) + { + case 0: KeccakRound0(); /* fall through */ + case 3: KeccakRound1(); + case 2: KeccakRound2(); + case 1: KeccakRound3(); + } + nRounds = 0; + } + while ( *pRoundConstants != 0xFF ); + + #undef Aba0 + #undef Aba1 + #undef Abe0 + #undef Abe1 + #undef Abi0 + #undef Abi1 + #undef Abo0 + #undef Abo1 + #undef Abu0 + #undef Abu1 + #undef Aga0 + #undef Aga1 + #undef Age0 + #undef Age1 + #undef Agi0 + #undef Agi1 + #undef Ago0 + #undef Ago1 + #undef Agu0 + #undef Agu1 + #undef Aka0 + #undef Aka1 + #undef Ake0 + #undef Ake1 + #undef Aki0 + #undef Aki1 + #undef Ako0 + #undef Ako1 + #undef Aku0 + #undef Aku1 + #undef Ama0 + #undef Ama1 + #undef Ame0 + #undef Ame1 + #undef Ami0 + #undef Ami1 + #undef Amo0 + #undef Amo1 + #undef Amu0 + #undef Amu1 + #undef Asa0 + #undef Asa1 + #undef Ase0 + #undef Ase1 + #undef Asi0 + #undef Asi1 + #undef Aso0 + #undef Aso1 + #undef Asu0 + #undef Asu1 +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Permute_12rounds(void *state) +{ + KeccakP1600_Permute_Nrounds(state, 12); +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s new file mode 100644 index 0000000..d7ae46b --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s @@ -0,0 +1,664 @@ +# Copyright (c) 2006-2017, CRYPTOGAMS by +# Copyright (c) 2017 Ronny Van Keer +# All rights reserved. +# +# The source code in this file is licensed under the CRYPTOGAMS license. +# For further details see http://www.openssl.org/~appro/cryptogams/. 
+# +# Notes: +# The code for the permutation (__KeccakF1600) was generated with +# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project +# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). +# The rest of the code was written by Ronny Van Keer. +# Adaptations for macOS by Stéphane Léon. + +.text + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_Initialize(void *state); +# +.ifdef macOS +.globl _KeccakP1600_AVX2_Initialize +_KeccakP1600_AVX2_Initialize: +.else +.globl KeccakP1600_AVX2_Initialize +.type KeccakP1600_AVX2_Initialize,@function +KeccakP1600_AVX2_Initialize: +.endif +.balign 32 + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,0*32(%rdi) + vmovdqu %ymm0,1*32(%rdi) + vmovdqu %ymm0,2*32(%rdi) + vmovdqu %ymm0,3*32(%rdi) + vmovdqu %ymm0,4*32(%rdi) + vmovdqu %ymm0,5*32(%rdi) + movq $0,6*32(%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +# %rdi %rsi %rdx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddByte +_KeccakP1600_AVX2_AddByte: +.else +.globl KeccakP1600_AVX2_AddByte +.type KeccakP1600_AVX2_AddByte,@function +KeccakP1600_AVX2_AddByte: +.endif +.balign 32 + mov %rdx, %rax + and $7, %rax + and $0xFFFFFFF8, %edx + lea mapState(%rip), %r9 + mov (%r9, %rdx), %rdx + add %rdx, %rdi + add %rax, %rdi + xorb %sil, (%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddBytes +_KeccakP1600_AVX2_AddBytes: +.else +.globl 
KeccakP1600_AVX2_AddBytes +.type KeccakP1600_AVX2_AddBytes,@function +KeccakP1600_AVX2_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rax, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane + add (%rdx), %rax # rax = pointer to state lane + add $8, %rdx + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rax) + inc %rax + dec %r9 + jnz KeccakP1600_AVX2_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_AddBytes_LaneAlignedCheck +KeccakP1600_AVX2_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + xor %r8, (%rax) +KeccakP1600_AVX2_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_AddBytes_LaneAlignedLoop +KeccakP1600_AVX2_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov (%rdx), %rax + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rax) + inc %rax + dec %rcx + jnz KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_ExtractBytes +_KeccakP1600_AVX2_ExtractBytes: +.else +.globl KeccakP1600_AVX2_ExtractBytes +.type KeccakP1600_AVX2_ExtractBytes,@function +KeccakP1600_AVX2_ExtractBytes: +.endif +.balign 32 + push %rbx + cmp 
$0, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck + mov $8, %rbx # rbx is (max) length of incomplete lane + sub %rax, %rbx + cmp %rcx, %rbx + cmovae %rcx, %rbx + sub %rbx, %rcx # length -= length of incomplete lane + mov (%rdx), %r9 + add $8, %rdx + add %rdi, %r9 + add %rax, %r9 +KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop: + mov (%r9), %r8b + inc %r9 + mov %r8b, (%rsi) + inc %rsi + dec %rbx + jnz KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop: + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + mov (%rax), %r8 + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov (%rdx), %rax + add %rdi, %rax + mov (%rax), %r8 +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_ExtractBytes_Exit: + pop %rbx + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx2: + ######################################### Theta + vpshufd $0b01001110,%ymm2,%ymm13 + vpxor %ymm3,%ymm5,%ymm12 + vpxor %ymm6,%ymm4,%ymm9 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm9,%ymm12,%ymm12 # C[1..4] + + vpermq $0b10010011,%ymm12,%ymm11 + vpxor %ymm2,%ymm13,%ymm13 + vpermq $0b01001110,%ymm13,%ymm7 + + vpsrlq $63,%ymm12,%ymm8 + vpaddq 
%ymm12,%ymm12,%ymm9 + vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) + + vpermq $0b00111001,%ymm8,%ymm15 + vpxor %ymm11,%ymm8,%ymm14 + vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] + + vpxor %ymm0,%ymm13,%ymm13 + vpxor %ymm7,%ymm13,%ymm13 # C[0..0] + + vpsrlq $63,%ymm13,%ymm7 + vpaddq %ymm13,%ymm13,%ymm8 + vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) + + vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] + vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] + + vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 + vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 + vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] + + ######################################### Rho + Pi + pre-Chi shuffle + vpsllvq 0*32-96(%r8),%ymm2,%ymm10 + vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 + vpor %ymm10,%ymm2,%ymm2 + + vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta + vpsllvq 2*32-96(%r8),%ymm3,%ymm11 + vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 + vpor %ymm11,%ymm3,%ymm3 + + vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta + vpsllvq 3*32-96(%r8),%ymm4,%ymm12 + vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 + vpor %ymm12,%ymm4,%ymm4 + + vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta + vpsllvq 4*32-96(%r8),%ymm5,%ymm13 + vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 + vpor %ymm13,%ymm5,%ymm5 + + vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta + vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 + vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 + vpsllvq 5*32-96(%r8),%ymm6,%ymm14 + vpsrlvq 5*32-96(%r9),%ymm6,%ymm8 + vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 + + vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta + vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 + vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 + vpsllvq 1*32-96(%r8),%ymm1,%ymm15 + vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 + vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 + + ######################################### Chi + vpsrldq $8,%ymm8,%ymm14 + vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] + + vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] 
[2][0] + vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] + vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] + vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] + vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] + vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] + vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] + vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] + vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] + vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] + vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] + vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] + vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] + vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] + + vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] + vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] + vpxor %ymm10,%ymm3,%ymm3 + vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] + vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] + vpxor %ymm12,%ymm5,%ymm5 + vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] + vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] + vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] + vpxor %ymm13,%ymm6,%ymm6 + + vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] + vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] + vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] + vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] + vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] + + vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] + vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] + vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] + vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] 
[2][2] + vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] + vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] + vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] + vpxor %ymm9,%ymm2,%ymm2 + + vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] + vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle + vpermq $0b10001101,%ymm5,%ymm5 + vpermq $0b01110010,%ymm6,%ymm6 + + vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] + vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] + vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] + vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] + vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] + vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] + vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm11,%ymm4,%ymm4 + + ######################################### Iota + vpxor (%r10),%ymm0,%ymm0 + lea 32(%r10),%r10 + + dec %eax + jnz .Loop_avx2 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + + + +.ifdef macOS +.globl _KeccakP1600_AVX2_Permute_12rounds +_KeccakP1600_AVX2_Permute_12rounds: +.else +.globl KeccakP1600_AVX2_Permute_12rounds +.type KeccakP1600_AVX2_Permute_12rounds,@function +KeccakP1600_AVX2_Permute_12rounds: +.endif +.balign 32 + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + lea 96(%rdi),%rdi + vzeroupper + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 + call __KeccakF1600 + vmovq %xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu 
%ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_12rounds_FastLoop_Absorb +_KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function +KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit + vzeroupper + cmp $21, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes + sub $21, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa map2(%rip), %xmm15 + vpcmpeqd %ymm14, %ymm14, %ymm14 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_21(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_21(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_21(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa 
mask6_21(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $21*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit: + vmovq %xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu %ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + sub $17, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa mask2_17(%rip), %ymm14 + vpxor %ymm9, %ymm9, %ymm9 + vmovdqa map2(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_17(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_17(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_17(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), 
%xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa mask6_17(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $17*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes: + lea mapState(%rip), %r9 + mov %rsi, %rax +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + mov (%r9), %r10 + add $8, %r9 + add %rdi, %r10 + xor %r8, (%r10) + sub $1, %rax + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX2_Permute_12rounds +.else + call KeccakP1600_AVX2_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.endif + +.equ ALLON, 0xFFFFFFFFFFFFFFFF + +.balign 64 +rhotates_left: + .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] + .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] + .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] + .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] + .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] + .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] +rhotates_right: + .quad 64-3, 64-18, 64-36, 64-41 + .quad 64-1, 64-62, 64-28, 64-27 + .quad 64-45, 64-6, 64-56, 64-39 + .quad 64-10, 64-61, 64-55, 64-8 + .quad 64-2, 64-15, 64-25, 64-20 + .quad 64-44, 64-43, 
64-21, 64-14 +iotas: + .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 + .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 + .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a + .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 + .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 + .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 + .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 + .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a + .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b + .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b + .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 + .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 + .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a + .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 + .quad 0x0000000080000001, 0x0000000080000001, 
0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 + +mapState: + .quad 0*8, 1*8, 2*8, 3*8, 4*8 + .quad 7*8, 21*8, 10*8, 15*8, 20*8 + .quad 5*8, 13*8, 22*8, 19*8, 12*8 + .quad 8*8, 9*8, 18*8, 23*8, 16*8 + .quad 6*8, 17*8, 14*8, 11*8, 24*8 + + .balign 16 +map2: + .long 10*8, 20*8, 5*8, 15*8 +map3: + .long 16*8, 7*8, 23*8, 14*8 +map4: + .long 11*8, 22*8, 8*8, 19*8 +map5: + .long 21*8, 17*8, 13*8, 9*8 +map6: + .long 6*8, 12*8, 18*8, 24*8 + + .balign 32 +mask3_21: + .quad ALLON, ALLON, 0, ALLON +mask4_21: + .quad ALLON, 0, ALLON, ALLON +mask5_21: + .quad 0, ALLON, ALLON, ALLON +mask6_21: + .quad ALLON, ALLON, ALLON, 0 + +mask2_17: + .quad ALLON, 0, ALLON, ALLON +mask3_17: + .quad ALLON, ALLON, 0, ALLON +mask4_17: + .quad ALLON, 0, ALLON, 0 +mask5_17: + .quad 0, 0, ALLON, ALLON +mask6_17: + .quad ALLON, ALLON, 0, 0 + +.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c new file mode 100644 index 0000000..b426421 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c @@ -0,0 +1,241 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +We would like to thank Vladimir Sedach, we have used parts of his Keccak AVX-512 C++ code. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "align.h" + +typedef __m512i V512; + +#define XOR(a,b) _mm512_xor_si512(a,b) +#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define ROL(a,offset) _mm512_rol_epi64(a,offset) +#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) + +#define LOAD_Lanes(m,a) _mm512_maskz_loadu_epi64(m,a) +#define LOAD_Lane(a) LOAD_Lanes(0x01,a) +#define LOAD_Plane(a) LOAD_Lanes(0x1F,a) +#define LOAD_8Lanes(a) LOAD_Lanes(0xFF,a) +#define STORE_Lanes(a,m,v) _mm512_mask_storeu_epi64(a,m,v) +#define STORE_Lane(a,v) STORE_Lanes(a,0x01,v) +#define STORE_Plane(a,v) STORE_Lanes(a,0x1F,v) +#define STORE_8Lanes(a,v) STORE_Lanes(a,0xFF,v) + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_Initialize(void *state) +{ + memset(state, 0, 1600/8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint8_t *stateAsBytes; + uint64_t *stateAsLanes; + + for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length) + stateAsBytes[offset] ^= *(data++); + for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 8*8; stateAsLanes += 8, data += 8*8, length -= 8*8) + STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data))); + for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8) + STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data))); + for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length) + *(stateAsBytes++) ^= *(data++); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int 
length) +{ + memcpy(data, (unsigned char*)state+offset, length); +} + +/* ---------------------------------------------------------------- */ + +const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +#define KeccakP_DeclareVars \ + V512 b0, b1, b2, b3, b4; \ + V512 Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; \ + V512 moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); \ + V512 moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); \ + V512 rhoB = _mm512_setr_epi64( 0, 1, 62, 28, 27, 0, 0, 0); \ + V512 rhoG = _mm512_setr_epi64(36, 44, 6, 55, 20, 0, 0, 0); \ + V512 rhoK = _mm512_setr_epi64( 3, 10, 43, 25, 39, 0, 0, 0); \ + V512 rhoM = _mm512_setr_epi64(41, 45, 15, 21, 8, 0, 0, 0); \ + V512 rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); \ + V512 pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); \ + V512 pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \ + V512 pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \ + V512 pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \ + V512 pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \ + V512 pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); \ + V512 pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \ + V512 pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \ + V512 pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \ + V512 pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7); + +#define 
copyFromState(pState) \ + Baeiou = LOAD_Plane(pState+ 0); \ + Gaeiou = LOAD_Plane(pState+ 5); \ + Kaeiou = LOAD_Plane(pState+10); \ + Maeiou = LOAD_Plane(pState+15); \ + Saeiou = LOAD_Plane(pState+20); + +#define copyToState(pState) \ + STORE_Plane(pState+ 0, Baeiou); \ + STORE_Plane(pState+ 5, Gaeiou); \ + STORE_Plane(pState+10, Kaeiou); \ + STORE_Plane(pState+15, Maeiou); \ + STORE_Plane(pState+20, Saeiou); + +#define KeccakP_Round(i) \ + /* Theta */ \ + b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); \ + b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \ + b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \ + b0 = _mm512_rol_epi64(b0, 1); \ + Baeiou = XOR3( Baeiou, b0, b1 ); \ + Gaeiou = XOR3( Gaeiou, b0, b1 ); \ + Kaeiou = XOR3( Kaeiou, b0, b1 ); \ + Maeiou = XOR3( Maeiou, b0, b1 ); \ + Saeiou = XOR3( Saeiou, b0, b1 ); \ + /* Rho */ \ + Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \ + Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \ + Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \ + Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \ + Saeiou = _mm512_rolv_epi64(Saeiou, rhoS); \ + /* Pi 1 */ \ + b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \ + b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \ + b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \ + b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \ + b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \ + /* Chi */ \ + Baeiou = Chi(b0, b1, b2); \ + Gaeiou = Chi(b1, b2, b3); \ + Kaeiou = Chi(b2, b3, b4); \ + Maeiou = Chi(b3, b4, b0); \ + Saeiou = Chi(b4, b0, b1); \ + /* Iota */ \ + Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); \ + /* Pi 2 */ \ + b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \ + b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \ + b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \ + b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \ + b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \ + Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \ + Gaeiou = 
_mm512_permutex2var_epi64(b2, pi2BG, b3); \ + Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \ + Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \ + Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou) + +#define rounds12 \ + KeccakP_Round( 12 ); \ + KeccakP_Round( 13 ); \ + KeccakP_Round( 14 ); \ + KeccakP_Round( 15 ); \ + KeccakP_Round( 16 ); \ + KeccakP_Round( 17 ); \ + KeccakP_Round( 18 ); \ + KeccakP_Round( 19 ); \ + KeccakP_Round( 20 ); \ + KeccakP_Round( 21 ); \ + KeccakP_Round( 22 ); \ + KeccakP_Round( 23 ) + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_Permute_12rounds(void *state) +{ + KeccakP_DeclareVars + uint64_t *stateAsLanes = (uint64_t*)state; + + copyFromState(stateAsLanes); + rounds12; + copyToState(stateAsLanes); +} + +/* ---------------------------------------------------------------- */ + +#include + +size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + size_t originalDataByteLen = dataByteLen; + + assert(laneCount == 21); + + KeccakP_DeclareVars; + uint64_t *stateAsLanes = (uint64_t*)state; + uint64_t *inDataAsLanes = (uint64_t*)data; + + copyFromState(stateAsLanes); + while(dataByteLen >= 21*8) { + Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0)); + Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5)); + Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10)); + Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15)); + Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20)); + rounds12; + inDataAsLanes += 21; + dataByteLen -= 21*8; + } + copyToState(stateAsLanes); + + return originalDataByteLen - dataByteLen; +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s new file mode 100644 index 0000000..383ca43 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s @@ -0,0 +1,551 @@ +# 
Copyright (c) 2006-2017, CRYPTOGAMS by +# Copyright (c) 2018 Ronny Van Keer +# All rights reserved. +# +# The source code in this file is licensed under the CRYPTOGAMS license. +# For further details see http://www.openssl.org/~appro/cryptogams/. +# +# Notes: +# The code for the permutation (__KeccakF1600) was generated with +# Andy Polyakov's keccak1600-avx512.pl from the CRYPTOGAMS project +# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx512.pl). +# The rest of the code was written by Ronny Van Keer. +# Adaptations for macOS by Stéphane Léon. + +.text + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_Initialize(void *state); +# +.ifdef macOS +.globl _KeccakP1600_AVX512_Initialize +_KeccakP1600_AVX512_Initialize: +.else +.globl KeccakP1600_AVX512_Initialize +.type KeccakP1600_AVX512_Initialize,@function +KeccakP1600_AVX512_Initialize: +.endif +.balign 32 + vpxorq %zmm0,%zmm0,%zmm0 + vmovdqu64 %zmm0,0*64(%rdi) + vmovdqu64 %zmm0,1*64(%rdi) + vmovdqu64 %zmm0,2*64(%rdi) + movq $0,3*64(%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_Initialize,.-KeccakP1600_AVX512_Initialize +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); +# %rdi %rsi %rdx +#!! 
+#.globl KeccakP1600_AVX512_AddByte +#.type KeccakP1600_AVX512_AddByte,@function +#.balign 32 +#KeccakP1600_AVX512_AddByte: +# mov %rdx, %rax +# and $7, %rax +# and $0xFFFFFFF8, %edx +# mov mapState(%rdx), %rdx +# add %rdx, %rdi +# add %rax, %rdi +# xorb %sil, (%rdi) +# ret +#.size KeccakP1600_AVX512_AddByte,.-KeccakP1600_AVX512_AddByte + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_AddBytes +_KeccakP1600_AVX512_AddBytes: +.else +.globl KeccakP1600_AVX512_AddBytes +.type KeccakP1600_AVX512_AddBytes,@function +KeccakP1600_AVX512_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rdx, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rdi) + inc %rdi + dec %r9 + jnz KeccakP1600_AVX512_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_AddBytes_LaneAlignedCheck +KeccakP1600_AVX512_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + xor %r8, (%rdi) + add $8, %rdi +KeccakP1600_AVX512_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_AddBytes_LaneAlignedLoop +KeccakP1600_AVX512_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit +KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rdi) + inc %rdi + dec %rcx + jnz KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX512_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_AddBytes,.-KeccakP1600_AVX512_AddBytes +.endif + +# 
----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_ExtractBytes +_KeccakP1600_AVX512_ExtractBytes: +.else +.globl KeccakP1600_AVX512_ExtractBytes +.type KeccakP1600_AVX512_ExtractBytes,@function +KeccakP1600_AVX512_ExtractBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck + mov $8, %rax # rax is (max) length of incomplete lane + sub %rdx, %rax + cmp %rcx, %rax + cmovae %rcx, %rax + sub %rax, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop: + mov (%rdi), %r8b + inc %rdi + mov %r8b, (%rsi) + inc %rsi + dec %rax + jnz KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop: + mov (%rdi), %r8 + add $8, %rdi + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + mov (%rdi), %r8 +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop +KeccakP1600_AVX512_ExtractBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_ExtractBytes,.-KeccakP1600_AVX512_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.text +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx512: + ######################################### Theta, even round + vmovdqa64 
%zmm0,%zmm5 # put aside original A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm0 # and use it as "C00" + vpternlogq $0x96,%zmm4,%zmm3,%zmm0 + vprolq $1,%zmm0,%zmm6 + vpermq %zmm0,%zmm13,%zmm0 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm0,%zmm6,%zmm5 # T[0] is original A00 + vpternlogq $0x96,%zmm0,%zmm6,%zmm1 + vpternlogq $0x96,%zmm0,%zmm6,%zmm2 + vpternlogq $0x96,%zmm0,%zmm6,%zmm3 + vpternlogq $0x96,%zmm0,%zmm6,%zmm4 + ######################################### Rho + vprolvq %zmm22,%zmm5,%zmm0 # T[0] is original A00 + vprolvq %zmm23,%zmm1,%zmm1 + vprolvq %zmm24,%zmm2,%zmm2 + vprolvq %zmm25,%zmm3,%zmm3 + vprolvq %zmm26,%zmm4,%zmm4 + ######################################### Pi + vpermq %zmm0,%zmm17,%zmm0 + vpermq %zmm1,%zmm18,%zmm1 + vpermq %zmm2,%zmm19,%zmm2 + vpermq %zmm3,%zmm20,%zmm3 + vpermq %zmm4,%zmm21,%zmm4 + ######################################### Chi + vmovdqa64 %zmm0,%zmm5 + vmovdqa64 %zmm1,%zmm6 + vpternlogq $0xD2,%zmm2,%zmm1,%zmm0 + vpternlogq $0xD2,%zmm3,%zmm2,%zmm1 + vpternlogq $0xD2,%zmm4,%zmm3,%zmm2 + vpternlogq $0xD2,%zmm5,%zmm4,%zmm3 + vpternlogq $0xD2,%zmm6,%zmm5,%zmm4 + ######################################### Iota + vpxorq (%r10),%zmm0,%zmm0{%k1} + lea 16(%r10),%r10 + ######################################### Harmonize rounds + vpblendmq %zmm2,%zmm1,%zmm6{%k2} + vpblendmq %zmm3,%zmm2,%zmm7{%k2} + vpblendmq %zmm4,%zmm3,%zmm8{%k2} + vpblendmq %zmm1,%zmm0,%zmm5{%k2} + vpblendmq %zmm0,%zmm4,%zmm9{%k2} + vpblendmq %zmm3,%zmm6,%zmm6{%k3} + vpblendmq %zmm4,%zmm7,%zmm7{%k3} + vpblendmq %zmm2,%zmm5,%zmm5{%k3} + vpblendmq %zmm0,%zmm8,%zmm8{%k3} + vpblendmq %zmm1,%zmm9,%zmm9{%k3} + vpblendmq %zmm4,%zmm6,%zmm6{%k4} + vpblendmq %zmm3,%zmm5,%zmm5{%k4} + vpblendmq %zmm0,%zmm7,%zmm7{%k4} + vpblendmq %zmm1,%zmm8,%zmm8{%k4} + vpblendmq %zmm2,%zmm9,%zmm9{%k4} + vpblendmq %zmm4,%zmm5,%zmm5{%k5} + vpblendmq %zmm0,%zmm6,%zmm6{%k5} + vpblendmq %zmm1,%zmm7,%zmm7{%k5} + vpblendmq %zmm2,%zmm8,%zmm8{%k5} + vpblendmq %zmm3,%zmm9,%zmm9{%k5} + #vpermq %zmm5,%zmm33,%zmm0 # 
doesn't actually change order + vpermq %zmm6,%zmm13,%zmm1 + vpermq %zmm7,%zmm14,%zmm2 + vpermq %zmm8,%zmm15,%zmm3 + vpermq %zmm9,%zmm16,%zmm4 + ######################################### Theta, odd round + vmovdqa64 %zmm5,%zmm0 # real A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm5 # C00 is %zmm5's alias + vpternlogq $0x96,%zmm4,%zmm3,%zmm5 + vprolq $1,%zmm5,%zmm6 + vpermq %zmm5,%zmm13,%zmm5 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm5,%zmm6,%zmm0 + vpternlogq $0x96,%zmm5,%zmm6,%zmm3 + vpternlogq $0x96,%zmm5,%zmm6,%zmm1 + vpternlogq $0x96,%zmm5,%zmm6,%zmm4 + vpternlogq $0x96,%zmm5,%zmm6,%zmm2 + ######################################### Rho + vprolvq %zmm27,%zmm0,%zmm0 + vprolvq %zmm30,%zmm3,%zmm6 + vprolvq %zmm28,%zmm1,%zmm7 + vprolvq %zmm31,%zmm4,%zmm8 + vprolvq %zmm29,%zmm2,%zmm9 + vpermq %zmm0,%zmm16,%zmm10 + vpermq %zmm0,%zmm15,%zmm11 + ######################################### Iota + vpxorq -8(%r10),%zmm0,%zmm0{%k1} + ######################################### Pi + vpermq %zmm6,%zmm14,%zmm1 + vpermq %zmm7,%zmm16,%zmm2 + vpermq %zmm8,%zmm13,%zmm3 + vpermq %zmm9,%zmm15,%zmm4 + ######################################### Chi + vpternlogq $0xD2,%zmm11,%zmm10,%zmm0 + vpermq %zmm6,%zmm13,%zmm12 + #vpermq %zmm6,%zmm33,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm12,%zmm1 + vpermq %zmm7,%zmm15,%zmm5 + vpermq %zmm7,%zmm14,%zmm7 + vpternlogq $0xD2,%zmm7,%zmm5,%zmm2 + #vpermq %zmm8,%zmm33,%zmm8 + vpermq %zmm8,%zmm16,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm8,%zmm3 + vpermq %zmm9,%zmm14,%zmm5 + vpermq %zmm9,%zmm13,%zmm9 + vpternlogq $0xD2,%zmm9,%zmm5,%zmm4 + dec %eax + jnz .Loop_avx512 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_Permute_12rounds(void *state); +# %rdi +# +.ifdef macOS +.globl _KeccakP1600_AVX512_Permute_12rounds +_KeccakP1600_AVX512_Permute_12rounds: +.else +.globl KeccakP1600_AVX512_Permute_12rounds +.type 
KeccakP1600_AVX512_Permute_12rounds,@function +KeccakP1600_AVX512_Permute_12rounds: +.endif +.balign 32 + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + #vmovdqa64 64*0(%r8),%zmm33 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} +# vpxorq %zmm5,%zmm5,%zmm5 + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 %zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_Permute_12rounds,.-KeccakP1600_AVX512_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_12rounds_FastLoop_Absorb +_KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX512_12rounds_FastLoop_Absorb,@function 
+KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX512_FastLoop_Absorb_Exit + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + cmp $21, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes + sub $21, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k6}{z} + vmovdqu64 8*20(%rdx),%zmm9{%k1}{z} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + vpxorq %zmm9,%zmm4,%zmm4 + add $21*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit: + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 
%zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} +KeccakP1600_AVX512_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes + sub $17, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k1}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k2} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + add $17*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes: + lea -96(%rdi), %rdi +KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop: + mov %rsi, %rax + mov %rdi, %r10 +KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + xor %r8, (%r10) + add $8, %r10 + sub $1, %rax + jnz KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX512_Permute_12rounds +.else + call KeccakP1600_AVX512_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop + jmp KeccakP1600_AVX512_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX512_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.endif +.balign 64 +theta_perm: + .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] + .quad 4, 0, 1, 2, 3, 5, 6, 7 + .quad 3, 4, 0, 1, 2, 5, 6, 7 + .quad 2, 3, 4, 0, 1, 5, 6, 7 + .quad 1, 2, 3, 4, 0, 5, 6, 7 +rhotates1: + .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] + .quad 18, 1, 6, 25, 8, 
0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] + .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] + .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] + .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] +rhotates0: + .quad 0, 1, 62, 28, 27, 0, 0, 0 + .quad 36, 44, 6, 55, 20, 0, 0, 0 + .quad 3, 10, 43, 25, 39, 0, 0, 0 + .quad 41, 45, 15, 21, 8, 0, 0, 0 + .quad 18, 2, 61, 56, 14, 0, 0, 0 +pi0_perm: + .quad 0, 3, 1, 4, 2, 5, 6, 7 + .quad 1, 4, 2, 0, 3, 5, 6, 7 + .quad 2, 0, 3, 1, 4, 5, 6, 7 + .quad 3, 1, 4, 2, 0, 5, 6, 7 + .quad 4, 2, 0, 3, 1, 5, 6, 7 +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +iotas_end: +.asciz "Keccak-1600 for AVX-512F, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h new file mode 100644 index 0000000..709469c --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h @@ -0,0 +1,74 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX512_Initialize(void *state); +void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_AVX512_Permute_12rounds(void *state); +void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX2_Initialize(void *state); +void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void 
KeccakP1600_AVX2_Permute_12rounds(void *state); +void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_opt64_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c new file mode 100644 index 0000000..e98056d --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c @@ -0,0 +1,1026 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
 + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include "brg_endian.h" +#include /* NOTE(review): angle-bracket header name lost in transit (stripped like an HTML tag) — confirm against upstream K12 lib/Optimized64/KeccakP-1600-opt64.c */ + +#define KeccakP1600_opt64_implementation_config "all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, 6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled, using SHLD for rotations" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +#define KeccakP1600_opt64_useSHLD +*/ + +#if defined(KeccakP1600_opt64_useLaneComplementing) +#define UseBebigokimisa +#endif + +#if defined(_MSC_VER) +#define ROL64(a, offset) _rotl64(a, offset) +#elif defined(KeccakP1600_opt64_useSHLD) + #define ROL64(x,N) ({ \ + register uint64_t __out; \ + register uint64_t __in = x; \ + __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ + __out; \ + }) +#else +#define ROL64(a, offset) ((((uint64_t)a) << offset) ^ (((uint64_t)a) >> (64-offset))) +#endif + +#ifdef KeccakP1600_opt64_fullUnrolling +#define FullUnrolling +#else +#define Unrolling KeccakP1600_opt64_unrolling +#endif + +static const uint64_t KeccakF1600RoundConstants[24] = { + 
0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +#ifdef KeccakP1600_opt64_useLaneComplementing + ((uint64_t*)state)[ 1] = ~(uint64_t)0; + ((uint64_t*)state)[ 2] = ~(uint64_t)0; + ((uint64_t*)state)[ 8] = ~(uint64_t)0; + ((uint64_t*)state)[12] = ~(uint64_t)0; + ((uint64_t*)state)[17] = ~(uint64_t)0; + ((uint64_t*)state)[20] = ~(uint64_t)0; +#endif +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint64_t lane; + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; +#else + uint64_t lane = 0; + unsigned int i; + for(i=0; i 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, 
unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#define declareABCDE \ + uint64_t Aba, Abe, Abi, Abo, Abu; \ + uint64_t Aga, Age, Agi, Ago, Agu; \ + uint64_t Aka, Ake, Aki, Ako, Aku; \ + uint64_t Ama, Ame, Ami, Amo, Amu; \ + uint64_t Asa, Ase, Asi, Aso, Asu; \ + uint64_t Bba, Bbe, Bbi, Bbo, Bbu; \ + uint64_t Bga, Bge, Bgi, Bgo, Bgu; \ + uint64_t Bka, Bke, Bki, Bko, Bku; \ + uint64_t Bma, Bme, Bmi, Bmo, Bmu; \ + uint64_t Bsa, Bse, Bsi, Bso, Bsu; \ + uint64_t Ca, Ce, Ci, Co, Cu; \ + uint64_t Da, De, Di, Do, Du; \ + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; \ + uint64_t Ega, Ege, Egi, Ego, Egu; \ + uint64_t Eka, Eke, Eki, Eko, Eku; \ + uint64_t Ema, Eme, Emi, Emo, Emu; \ + uint64_t Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = Aba^Aga^Aka^Ama^Asa; \ + Ce = Abe^Age^Ake^Ame^Ase; \ + Ci = Abi^Agi^Aki^Ami^Asi; \ + Co = Abo^Ago^Ako^Amo^Aso; \ + Cu = Abu^Agu^Aku^Amu^Asu; \ + +#ifdef UseBebigokimisa +/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^( Bbo & Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^( Bbu | Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^( Bba & Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= 
Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^( Bgi & Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + Ci ^= E##gi; \ + E##go = Bgo ^( Bgu | Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^( Bga & Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^( Bki & Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = (~Bko)^( Bku | Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^( Bka & Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^( Bmi | Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + Ci ^= E##mi; \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^( Bma | Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = (~Bse)^( Bsi | Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^( Bso & Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^( Bsu | Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^( Bsa & Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 
1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + E##bi = Bbi ^( Bbo & Bbu ); \ + E##bo = Bbo ^( Bbu | Bba ); \ + E##bu = Bbu ^( Bba & Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + E##ge = Bge ^( Bgi & Bgo ); \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + E##go = Bgo ^( Bgu | Bga ); \ + E##gu = Bgu ^( Bga & Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + E##ke = Bke ^( Bki & Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = (~Bko)^( Bku | Bka ); \ + E##ku = Bku ^( Bka & Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + E##me = Bme ^( Bmi | Bmo ); \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + E##mu = Bmu ^( Bma | Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = (~Bse)^( Bsi | Bso ); \ + E##si = Bsi ^( Bso & Bsu ); \ + 
E##so = Bso ^( Bsu | Bsa ); \ + E##su = Bsu ^( Bsa & Bse ); \ +\ + +#else /* UseBebigokimisa */ +/* --- Code for round, with prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^((~Bba)& Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + Ci ^= E##gi; \ + E##go = Bgo ^((~Bgu)& Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^((~Bga)& Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^((~Bki)& Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = Bko ^((~Bku)& Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^((~Bka)& Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = 
ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^((~Bmi)& Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + Ci ^= E##mi; \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^((~Bma)& Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = Bse ^((~Bsi)& Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^((~Bso)& Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^((~Bsu)& Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^((~Bsa)& Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + E##bu = Bbu ^((~Bba)& Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + E##go = Bgo ^((~Bgu)& Bga ); \ + E##gu = Bgu ^((~Bga)& Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu 
^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + E##ke = Bke ^((~Bki)& Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = Bko ^((~Bku)& Bka ); \ + E##ku = Bku ^((~Bka)& Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + E##me = Bme ^((~Bmi)& Bmo ); \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + E##mu = Bmu ^((~Bma)& Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = Bse ^((~Bsi)& Bso ); \ + E##si = Bsi ^((~Bso)& Bsu ); \ + E##so = Bso ^((~Bsu)& Bsa ); \ + E##su = Bsu ^((~Bsa)& Bse ); \ +\ + +#endif /* UseBebigokimisa */ + +#define copyFromState(X, state) \ + X##ba = state[ 0]; \ + X##be = state[ 1]; \ + X##bi = state[ 2]; \ + X##bo = state[ 3]; \ + X##bu = state[ 4]; \ + X##ga = state[ 5]; \ + X##ge = state[ 6]; \ + X##gi = state[ 7]; \ + X##go = state[ 8]; \ + X##gu = state[ 9]; \ + X##ka = state[10]; \ + X##ke = state[11]; \ + X##ki = state[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba; \ + state[ 1] = X##be; \ + state[ 2] = X##bi; \ + state[ 3] = X##bo; \ + state[ 4] = X##bu; \ + state[ 5] = X##ga; \ + state[ 6] = X##ge; \ + state[ 7] = X##gi; \ + state[ 8] = X##go; \ + state[ 9] = X##gu; \ + state[10] = X##ka; \ + state[11] = X##ke; \ + 
state[12] = X##ki; \ + state[13] = X##ko; \ + state[14] = X##ku; \ + state[15] = X##ma; \ + state[16] = X##me; \ + state[17] = X##mi; \ + state[18] = X##mo; \ + state[19] = X##mu; \ + state[20] = X##sa; \ + state[21] = X##se; \ + state[22] = X##si; \ + state[23] = X##so; \ + state[24] = X##su; \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + +#if ((defined(FullUnrolling)) || (Unrolling == 12)) +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (Unrolling == 4) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + 
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (Unrolling == 3) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#elif (Unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#elif (Unrolling == 1) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#else +#error "Unrolling is not correctly specified!" +#endif + +void KeccakP1600_opt64_Permute_12rounds(void *state) +{ + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + + copyFromState(A, stateAsLanes) + rounds12 + copyToState(stateAsLanes, A) +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; +#ifdef KeccakP1600_opt64_useLaneComplementing + if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) + lane = ~lane; +#endif +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +#else + unsigned int i; + lane >>= offset*8; + for(i=0; i>= 8; + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +static void fromWordToBytes(uint8_t *bytes, const uint64_t word) +{ + unsigned int i; + + for(i=0; i<(64/8); 
i++) + bytes[i] = (word >> (8*i)) & 0xFF; +} +#endif + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + memcpy(data, state, laneCount*8); +#else + unsigned int i; + + for(i=0; i 1) { + ((uint64_t*)data)[ 1] = ~((uint64_t*)data)[ 1]; + if (laneCount > 2) { + ((uint64_t*)data)[ 2] = ~((uint64_t*)data)[ 2]; + if (laneCount > 8) { + ((uint64_t*)data)[ 8] = ~((uint64_t*)data)[ 8]; + if (laneCount > 12) { + ((uint64_t*)data)[12] = ~((uint64_t*)data)[12]; + if (laneCount > 17) { + ((uint64_t*)data)[17] = ~((uint64_t*)data)[17]; + if (laneCount > 20) { + ((uint64_t*)data)[20] = ~((uint64_t*)data)[20]; + } + } + } + } + } + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, 
KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define HTOLE64(x) (x) +#else +#define HTOLE64(x) (\ + ((x & 0xff00000000000000ull) >> 56) | \ + ((x & 0x00ff000000000000ull) >> 40) | \ + ((x & 0x0000ff0000000000ull) >> 24) | \ + ((x & 0x000000ff00000000ull) >> 8) | \ + ((x & 0x00000000ff000000ull) << 8) | \ + ((x & 0x0000000000ff0000ull) << 24) | \ + ((x & 0x000000000000ff00ull) << 40) | \ + ((x & 0x00000000000000ffull) << 56)) +#endif + +#define addInput(X, input, laneCount) \ + if (laneCount == 21) { \ + X##ba ^= HTOLE64(input[ 0]); \ + X##be ^= HTOLE64(input[ 1]); \ + X##bi ^= HTOLE64(input[ 2]); \ + X##bo ^= HTOLE64(input[ 3]); \ + X##bu ^= HTOLE64(input[ 4]); \ + X##ga ^= HTOLE64(input[ 5]); \ + X##ge ^= HTOLE64(input[ 6]); \ + X##gi ^= HTOLE64(input[ 7]); \ + X##go ^= HTOLE64(input[ 8]); \ + X##gu ^= HTOLE64(input[ 9]); \ + X##ka ^= HTOLE64(input[10]); \ + X##ke ^= HTOLE64(input[11]); \ + X##ki ^= HTOLE64(input[12]); \ + X##ko ^= HTOLE64(input[13]); \ + X##ku ^= HTOLE64(input[14]); \ + X##ma ^= HTOLE64(input[15]); \ + X##me ^= HTOLE64(input[16]); \ + X##mi ^= HTOLE64(input[17]); \ + X##mo ^= HTOLE64(input[18]); \ + X##mu ^= HTOLE64(input[19]); \ + X##sa ^= HTOLE64(input[20]); \ + } \ + +#include <assert.h> + +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + size_t originalDataByteLen = dataByteLen; + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + uint64_t *inDataAsLanes = (uint64_t*)data; + + assert(laneCount == 21); + + #define laneCount 21 + copyFromState(A, stateAsLanes) + while(dataByteLen >= laneCount*8) { + addInput(A, inDataAsLanes, laneCount) + rounds12 + inDataAsLanes += laneCount; + dataByteLen -= laneCount*8; + } + #undef laneCount + 
copyToState(stateAsLanes, A) + return originalDataByteLen - dataByteLen; +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c new file mode 100644 index 0000000..22a0901 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c @@ -0,0 +1,406 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include +#include +#include +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +#ifdef KeccakP1600_disableParallelism +#undef KeccakP1600_enable_simd_options +#else + +// Forward declaration +void KangarooTwelve_SetProcessorCapabilities(); +#ifdef KeccakP1600_enable_simd_options +int K12_SSSE3_requested_disabled = 0; +int K12_AVX2_requested_disabled = 0; +int K12_AVX512_requested_disabled = 0; +#endif // KeccakP1600_enable_simd_options +int K12_enableSSSE3 = 0; +int K12_enableAVX2 = 0; +int K12_enableAVX512 = 0; + +/* ---------------------------------------------------------------- */ + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times2_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableSSSE3; + return result; +} + +const char * KeccakP1600times2_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableSSSE3) { + return "SSSE3 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process2Leaves(input, output); + } else if (K12_enableSSSE3) { + KangarooTwelve_SSSE3_Process2Leaves(input, output); + } +} + + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times4_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableAVX2; + return result; +} + +const char * KeccakP1600times4_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableAVX2) { + return "AVX2 implementation"; + } else { + return ""; + } +} + +void 
KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process4Leaves(input, output); + } else if (K12_enableAVX2) { + KangarooTwelve_AVX2_Process4Leaves(input, output); + } +} + + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times8_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + return result; +} + +const char * KeccakP1600times8_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) + KangarooTwelve_AVX512_Process8Leaves(input, output); +} + +#endif // KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation() +{ + if (K12_enableAVX512) + return "AVX-512 implementation"; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return "AVX2 implementation"; + else +#endif + return "generic 64-bit implementation"; +} + +void KeccakP1600_Initialize(void *state) +{ + KangarooTwelve_SetProcessorCapabilities(); + if (K12_enableAVX512) + KeccakP1600_AVX512_Initialize(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Initialize(state); + else +#endif + KeccakP1600_opt64_Initialize(state); +} + +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset) +{ + if (K12_enableAVX512) + ((unsigned char*)(state))[offset] ^= data; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_AddByte(state, data, offset); + else +#endif + KeccakP1600_opt64_AddByte(state, data, offset); +} + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_AddBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + 
KeccakP1600_AVX2_AddBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_AddBytes(state, data, offset, length); +} + +void KeccakP1600_Permute_12rounds(void *state) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_Permute_12rounds(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Permute_12rounds(state); + else +#endif + KeccakP1600_opt64_Permute_12rounds(state); +} + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_ExtractBytes(state, data, offset, length); +} + +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + if (K12_enableAVX512) + return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#endif + return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); +} + +/* ---------------------------------------------------------------- */ + +/* Processor capability detection code by Samuel Neves and Jack O'Connor, see + * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c + */ + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(IS_X86) +static uint64_t xgetbv() { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} 
+ +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +static enum cpu_feature g_cpu_features = UNDEFINED; + +static enum cpu_feature + get_cpu_features(void) { + + if (g_cpu_features != UNDEFINED) { + return g_cpu_features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + enum cpu_feature features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + g_cpu_features = features; + return features; +#else + /* How to detect NEON? 
*/ + return 0; +#endif + } +} + +void KangarooTwelve_SetProcessorCapabilities() +{ + enum cpu_feature features = get_cpu_features(); + K12_enableSSSE3 = (features & SSSE3); + K12_enableAVX2 = (features & AVX2); + K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); +#ifdef KeccakP1600_enable_simd_options + K12_enableSSSE3 = K12_enableSSSE3 && !K12_SSSE3_requested_disabled; + K12_enableAVX2 = K12_enableAVX2 && !K12_AVX2_requested_disabled; + K12_enableAVX512 = K12_enableAVX512 && !K12_AVX512_requested_disabled; +#endif // KeccakP1600_enable_simd_options +} + +#ifdef KeccakP1600_enable_simd_options +int KangarooTwelve_DisableSSSE3(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_SSSE3_requested_disabled = 1; + if (K12_enableSSSE3) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // SSSE3 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX2(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX2_requested_disabled = 1; + if (K12_enableAVX2) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX2 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX512(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX512_requested_disabled = 1; + if (K12_enableAVX512) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX512 was disabled on this call. + } else { + return 0; // Nothing changed. 
+ } +} + +void KangarooTwelve_EnableAllCpuFeatures(void) { + K12_SSSE3_requested_disabled = 0; + K12_AVX2_requested_disabled = 0; + K12_AVX512_requested_disabled = 0; + KangarooTwelve_SetProcessorCapabilities(); +} +#endif // KeccakP1600_enable_simd_options diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c new file mode 100644 index 0000000..0abab49 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c @@ -0,0 +1,419 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include <stdint.h> +#include <immintrin.h> +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX2alignment 32 + +#define ANDnu256(a, b) _mm256_andnot_si256(a, b) +#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) +#define CONST256_64(a) _mm256_set1_epi64x(a) +#define LOAD256(a) _mm256_load_si256((const __m256i *)&(a)) +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) +#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) +#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) +#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +static ALIGN(AVX2alignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; +static ALIGN(AVX2alignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; +#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define XOR256(a, b) _mm256_xor_si256(a, b) +#define XOReq256(a, b) a = _mm256_xor_si256(a, b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) +#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c)) +#define ZERO() _mm256_setzero_si256() + +static ALIGN(AVX2alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 
0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m256i Aba, Abe, Abi, Abo, Abu; \ + __m256i Aga, Age, Agi, Ago, Agu; \ + __m256i Aka, Ake, Aki, Ako, Aku; \ + __m256i Ama, Ame, Ami, Amo, Amu; \ + __m256i Asa, Ase, Asi, Aso, Asu; \ + __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m256i Bga, Bge, Bgi, Bgo, Bgu; \ + __m256i Bka, Bke, Bki, Bko, Bku; \ + __m256i Bma, Bme, Bmi, Bmo, Bmu; \ + __m256i Bsa, Bse, Bsi, Bso, Bsu; \ + __m256i Ca, Ce, Ci, Co, Cu; \ + __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ + __m256i Da, De, Di, Do, Du; \ + __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m256i Ega, Ege, Egi, Ego, Egu; \ + __m256i Eka, Eke, Eki, Eko, Eku; \ + __m256i Ema, Eme, Emi, Emo, Emu; \ + __m256i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ + Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ + Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ + Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ + Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + 
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(Ca, E##ga); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(Ce, E##ge); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + XOReq256(Ci, E##gi); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + XOReq256(Co, E##go); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ + XOReq256(Cu, E##gu); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(Ca, E##ka); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(Ce, E##ke); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + XOReq256(Ci, E##ki); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + XOReq256(Co, E##ko); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ + XOReq256(Cu, E##ku); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(Ca, E##ma); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(Ce, E##me); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + 
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + XOReq256(Ci, E##mi); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + XOReq256(Co, E##mo); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ + XOReq256(Cu, E##mu); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(Ca, E##sa); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(Ce, E##se); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + XOReq256(Ci, E##si); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + XOReq256(Co, E##so); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + XOReq256(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, 
ANDnu256(Bge, Bgi)); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ +\ + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = 
ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq256(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq256(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq256(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq256(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq256(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq256(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ + XOReq256(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq256(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq256(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq256(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq256(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq256(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq256(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq256(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq256(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq256(X##mi, LOAD4_64((data3)[17], 
(data2)[17], (data1)[17], (data0)[17])); \ + XOReq256(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq256(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq256(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq256(Ame, CONST256_64(0x0BULL)); + XOReq256(Asa, CONST256_64(0x8000000000000000ULL)); + rounds12 + + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( Aba, Abe ); + lanesH01 = UNPACKH( Aba, Abe ); + lanesL23 = UNPACKL( Abi, Abo ); + lanesH23 = UNPACKH( Abi, Abo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( 
lanesH01, lanesH23, 0x31 ) ); + } +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c new file mode 100644 index 0000000..a19fc35 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c @@ -0,0 +1,458 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX512alignment 64 + +#define LOAD4_32(a,b,c,d) _mm_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d)) +#define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h)) +#define LOAD_GATHER2_64(idx,p) _mm_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER4_64(idx,p) _mm256_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8) +#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8) + + +/* Keccak-p[1600]×2 */ + +#define XOR(a,b) _mm_xor_si128(a,b) +#define XOReq(a, b) a = _mm_xor_si128(a, b) +#define XOR3(a,b,c) _mm_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define ROL(a,offset) _mm_rol_epi64(a,offset) +#define Chi(a,b,c) _mm_ternarylogic_epi64(a,b,c,0xD2) 
+#define CONST_64(a) _mm_set1_epi64x(a) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(AVX512alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define KeccakP_DeclareVars(type) \ + type _Ba, _Be, _Bi, _Bo, _Bu; \ + type _Da, _De, _Di, _Do, _Du; \ + type _ba, _be, _bi, _bo, _bu; \ + type _ga, _ge, _gi, _go, _gu; \ + type _ka, _ke, _ki, _ko, _ku; \ + type _ma, _me, _mi, _mo, _mu; \ + type _sa, _se, _si, _so, _su + +#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \ + _Bb1 = XOR(_L1, _Da); \ + _Bb2 = XOR(_L2, _De); \ + _Bb3 = XOR(_L3, _Di); \ + _Bb4 = XOR(_L4, _Do); \ + _Bb5 = XOR(_L5, _Du); \ + if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \ + _Bb2 = ROL(_Bb2, _Rr2); \ + _Bb3 = ROL(_Bb3, _Rr3); \ + _Bb4 = ROL(_Bb4, _Rr4); \ + _Bb5 = ROL(_Bb5, _Rr5); \ + _L1 = Chi( _Ba, _Be, _Bi); \ + _L2 = Chi( _Be, _Bi, _Bo); \ + _L3 = Chi( _Bi, _Bo, _Bu); \ + _L4 = Chi( _Bo, _Bu, _Ba); \ + _L5 = Chi( _Bu, _Ba, _Be); + +#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \ + _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \ + _Be = XOR5( _be, _ge, _ke, _me, _se ); \ + _Bi = XOR5( 
_bi, _gi, _ki, _mi, _si ); \ + _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \ + _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \ + _Da = ROL( _Be, 1 ); \ + _De = ROL( _Bi, 1 ); \ + _Di = ROL( _Bo, 1 ); \ + _Do = ROL( _Bu, 1 ); \ + _Du = ROL( _Ba, 1 ); \ + _Da = XOR( _Da, _Bu ); \ + _De = XOR( _De, _Ba ); \ + _Di = XOR( _Di, _Be ); \ + _Do = XOR( _Do, _Bi ); \ + _Du = XOR( _Du, _Bo ); \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \ + _L1 = XOR(_L1, _rc) /* Iota */ + +#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 ) + +#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 ) + +#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 ) + +#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 ) + +#define KeccakP_4rounds( i ) \ + KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST_64(KeccakP1600RoundConstants[i]) ); \ + KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \ + KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \ + KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST_64(KeccakP1600RoundConstants[i+1]) ); \ + KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \ + KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \ + KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST_64(KeccakP1600RoundConstants[i+2]) ); \ + KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \ + KeccakP_ThetaRhoPiChi2( 
_ga, _me, _bi, _ko, _su ); \ + KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \ + KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST_64(KeccakP1600RoundConstants[i+3]) ); \ + KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \ + KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \ + KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su ) + +#define rounds12 \ + KeccakP_4rounds( 12 ); \ + KeccakP_4rounds( 16 ); \ + KeccakP_4rounds( 20 ) + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + 
+#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m128i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( _bi, _bo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( _bi, _bo ) ); +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef ROL +#undef Chi +#undef CONST_64 +#undef LOAD6464 +#undef STORE128u +#undef UNPACKL +#undef UNPACKH +#undef ZERO +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×4 */ + +#define XOR(a,b) _mm256_xor_si256(a,b) +#define XOReq(a,b) a = _mm256_xor_si256(a,b) +#define XOR3(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOR512(a,b) _mm512_xor_si512(a,b) +#define ROL(a,offset) _mm256_rol_epi64(a,offset) +#define Chi(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm256_set1_epi64x(a) +#define ZERO() _mm256_setzero_si256() +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) + +#define XORdata16(X, data0, data1, data2, data3) 
\ + XOReq(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m256i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); 
j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( _ba, _be ); + lanesH01 = UNPACKH( _ba, _be ); + lanesL23 = UNPACKL( _bi, _bo ); + lanesH23 = UNPACKH( _bi, _bo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); + } +/* TODO: check if something like this would be better: + index512 = LOAD8_32(3*laneOffset+1, 2*laneOffset+1, 1*laneOffset+1, 0*laneOffset+1, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset); + STORE_SCATTER8_64(dataAsLanes+0, index512, stateAsLanes512[0/2]); + STORE_SCATTER8_64(dataAsLanes+2, index512, stateAsLanes512[2/2]); +*/ +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef XOR512 +#undef ROL +#undef Chi +#undef CONST_64 +#undef ZERO +#undef LOAD4_64 +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×8 */ + +#define XOR(a,b) _mm512_xor_si512(a,b) +#define XOReq(a,b) a = _mm512_xor_si512(a,b) +#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOReq512(a, b) a = XOR(a,b) +#define ROL(a,offset) _mm512_rol_epi64(a,offset) +#define 
Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm512_set1_epi64(a) +#define ZERO() _mm512_setzero_si512() +#define LOAD(p) _mm512_loadu_si512(p) + +#define LoadAndTranspose8(dataAsLanes, offset) \ + t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \ + t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \ + t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \ + t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \ + t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \ + t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \ + t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \ + t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \ + r0 = _mm512_unpacklo_epi64(t0, t1); \ + r1 = _mm512_unpackhi_epi64(t0, t1); \ + r2 = _mm512_unpacklo_epi64(t2, t3); \ + r3 = _mm512_unpackhi_epi64(t2, t3); \ + r4 = _mm512_unpacklo_epi64(t4, t5); \ + r5 = _mm512_unpackhi_epi64(t4, t5); \ + r6 = _mm512_unpacklo_epi64(t6, t7); \ + r7 = _mm512_unpackhi_epi64(t6, t7); \ + t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \ + t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \ + t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \ + t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \ + t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \ + t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \ + t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \ + t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \ + r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \ + r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \ + r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \ + r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \ + r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \ + r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \ + r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \ + r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd); \ + +#define XORdata16(X, index, dataAsLanes) \ + LoadAndTranspose8(dataAsLanes, 0) \ + XOReq(X##ba, r0); \ + XOReq(X##be, r1); \ + XOReq(X##bi, r2); \ + XOReq(X##bo, r3); \ + XOReq(X##bu, r4); \ + XOReq(X##ga, r5); \ + XOReq(X##ge, r6); \ + XOReq(X##gi, r7); \ + 
LoadAndTranspose8(dataAsLanes, 8) \ + XOReq(X##go, r0); \ + XOReq(X##gu, r1); \ + XOReq(X##ka, r2); \ + XOReq(X##ke, r3); \ + XOReq(X##ki, r4); \ + XOReq(X##ko, r5); \ + XOReq(X##ku, r6); \ + XOReq(X##ma, r7); \ + +#define XORdata21(X, index, dataAsLanes) \ + XORdata16(X, index, dataAsLanes) \ + XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \ + XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \ + XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \ + XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \ + XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \ + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m512i); + unsigned int j; + const uint64_t *outputAsLanes = (const uint64_t *)output; + __m256i index; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + + initializeState(_); + + index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8)); + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, index, (const uint64_t *)input); + rounds12 + input += rateInBytes; + } + + XORdata16(_, index, (const uint64_t *)input); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4); + STORE_SCATTER8_64(outputAsLanes+0, index, _ba); + STORE_SCATTER8_64(outputAsLanes+1, index, _be); + STORE_SCATTER8_64(outputAsLanes+2, index, _bi); + STORE_SCATTER8_64(outputAsLanes+3, index, _bo); +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c new file mode 100644 index 0000000..036df52 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c @@ -0,0 +1,438 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) 
+https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define KeccakP1600times2_SSSE3_unrolling 2 + +#define SSSE3alignment 16 + +#define ANDnu128(a, b) _mm_andnot_si128(a, b) +#define CONST128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define CONST128_64(a) _mm_set1_epi64x(a) +#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) +#define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) +#define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) +static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; +static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; +#define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define XOR128(a, b) _mm_xor_si128(a, b) +#define XOReq128(a, b) a = _mm_xor_si128(a, b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 
0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m128i Aba, Abe, Abi, Abo, Abu; \ + __m128i Aga, Age, Agi, Ago, Agu; \ + __m128i Aka, Ake, Aki, Ako, Aku; \ + __m128i Ama, Ame, Ami, Amo, Amu; \ + __m128i Asa, Ase, Asi, Aso, Asu; \ + __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m128i Bga, Bge, Bgi, Bgo, Bgu; \ + __m128i Bka, Bke, Bki, Bko, Bku; \ + __m128i Bma, Bme, Bmi, Bmo, Bmu; \ + __m128i Bsa, Bse, Bsi, Bso, Bsu; \ + __m128i Ca, Ce, Ci, Co, Cu; \ + __m128i Da, De, Di, Do, Du; \ + __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m128i Ega, Ege, Egi, Ego, Egu; \ + __m128i Eka, Eke, Eki, Eko, Eku; \ + __m128i Ema, Eme, Emi, Emo, Emu; \ + __m128i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ + Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ + Ci = XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ + Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ + Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, 
CONST128_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(Ca, E##ga); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(Ce, E##ge); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + XOReq128(Ci, E##gi); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + XOReq128(Co, E##go); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ + XOReq128(Cu, E##gu); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(Ca, E##ka); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(Ce, E##ke); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + XOReq128(Ci, E##ki); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + XOReq128(Co, E##ko); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ + XOReq128(Cu, E##ku); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(Ca, E##ma); \ + XOReq128(A##mi, Di); \ + Bmo = 
ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(Ce, E##me); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + XOReq128(Ci, E##mi); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + XOReq128(Co, E##mo); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ + XOReq128(Cu, E##mu); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(Ca, E##sa); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(Ce, E##se); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ + XOReq128(Ci, E##si); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + XOReq128(Co, E##so); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ + XOReq128(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = 
ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(A##mi, Di); \ + Bmo = ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ +\ + +#define 
initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) +#define rounds12 \ + prepareTheta \ + 
thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 4) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#else +#error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" 
+#endif + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + #ifndef KeccakP1600times2_SSSE3_fullUnrolling + unsigned int i; + #endif + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); + XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); +} From b81d2e0c975c219e4460d762fc58081308914c1b Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 18:42:11 -0500 Subject: [PATCH 3/7] Flags for linux build and more K12 deps --- crypto/build.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index 8d1f280..e5c2a3d 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -15,7 +15,7 @@ fn main() { .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else { + } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") .define("_X86_", "1") @@ -33,15 +33,65 @@ fn main() { cc::Build::new() .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Plain64") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + 
.file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") .compile("KangarooTwelve"); cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") .define("__LINUX__", "1") .define("_X86_", "1") - //.define("_AMD64_", "1") + .compile("Chopper") + } else { + cc::Build::new() + .define("__LINUX__", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + 
.flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") .compile("Chopper") } } From 4f9d8a64563086adc5e8d889ee3e7ad624e997c5 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Sat, 9 Dec 2023 19:47:29 -0500 Subject: [PATCH 4/7] Building on Mac M1 --- crypto/build.rs | 22 +++++++--------------- identity/build.rs | 28 +++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index e5c2a3d..1bdf33e 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -18,7 +18,8 @@ fn main() { } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") - .define("_X86_", "1") + //.define("_X86_", "1") + .define("_AMD64_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") .include("../ffi-deps/FourQlib/FourQ_32bit") @@ -52,11 +53,13 @@ fn main() { cc::Build::new() .file("../ffi-deps/chopper-linux.cpp") .define("__LINUX__", "1") - .define("_X86_", "1") + //.define("_X86_", "1") + .define("_AMD64_", "1") .compile("Chopper") } else { cc::Build::new() .define("__LINUX__", "1") + .define("_AMD64_", "1") .define("_ARM_", "1") .define("_AVX_", "1") .define("USE_ENDO", "true") @@ -72,20 +75,9 @@ fn main() { cc::Build::new() .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + 
.include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") .compile("KangarooTwelve"); cc::Build::new() diff --git a/identity/build.rs b/identity/build.rs index 3f29496..d9f70e0 100644 --- a/identity/build.rs +++ b/identity/build.rs @@ -10,7 +10,7 @@ fn main() { .define("_AVX_", "1") .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") - } else { + } else if std::env::consts::OS == "linux" { cc::Build::new() .define("__LINUX__", "1") .define("_X86_", "1") @@ -38,6 +38,32 @@ fn main() { .include("../ffi-deps/FourQlib/FourQ_32bit") .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") .compile("Chopper") + } else { + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + + println!("cargo:rustc-link-lib=libFourQ"); + println!("cargo:rustc-link-lib=dylib=libFourQ"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") } } From 98947c959b6f9f72146bf0b82d81d0e896f4ea19 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Mon, 11 Dec 2023 13:00:25 -0500 Subject: [PATCH 5/7] Cpu tweaking in build --- crypto/build.rs | 206 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 66 deletions(-) diff --git a/crypto/build.rs 
b/crypto/build.rs index 1bdf33e..f8d270e 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -9,81 +9,155 @@ fn main() { fn main() { println!("Running crypto Build Step"); - if std::env::consts::OS == "windows" { + let os = std::env::consts::OS; + let arch = std::env::consts::ARCH; + + + if os == "windows" { cc::Build::new() .file("../ffi-deps/chopper-win.cpp") .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else if std::env::consts::OS == "linux" { - cc::Build::new() - .define("__LINUX__", "1") - //.define("_X86_", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + } else if os == "linux" { + if arch == "x86_64" || arch == "x86" { + cc::Build::new() + .define("__LINUX__", "1") + .define("_X86_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - 
.file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - //.define("_X86_", "1") - .define("_AMD64_", "1") - .compile("Chopper") + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_X86_", "1") + .compile("Chopper") + } else { //ARM + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + 
.file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } } else { - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - 
.file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); + if arch == "x86_64" || arch == "x86" { //Intel Mac + cc::Build::new() + .define("__LINUX__", "1") + .define("_X86_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") + cc::Build::new() + .include("../ffi-deps/K12/lib") + .include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } else { //Mac M1 Series + cc::Build::new() + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .define("_ARM_", "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); + + cc::Build::new() + .include("../ffi-deps/K12/lib") + 
.include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .file("../ffi-deps/K12/lib/KangarooTwelve.c") + .compile("KangarooTwelve"); + + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") + } } } From 0a0afec1162df2a635d284fe95aef5ce6a16f5f9 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 15 Dec 2023 15:06:38 -0500 Subject: [PATCH 6/7] Simplify FFI Build Script --- crypto/build.rs | 195 ++++++++++++++------------------------------ identity/Cargo.toml | 3 - identity/build.rs | 69 ---------------- 3 files changed, 59 insertions(+), 208 deletions(-) delete mode 100644 identity/build.rs diff --git a/crypto/build.rs b/crypto/build.rs index f8d270e..3ca1657 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -7,11 +7,17 @@ fn main() { #[cfg(feature = "encryption")] fn main() { - println!("Running crypto Build Step"); - let os = std::env::consts::OS; let arch = std::env::consts::ARCH; - + let cpu: &str = match arch { + "x86" => "_X86_", + "x86_64" => "_X86_", + _ => "_AMD64_" + }; + let extra_four_q_define: &str = match cpu { + "_X86_" => "_BOGUS_", + _ => "_ARM_" //Mac M1 need this + }; if os == "windows" { cc::Build::new() @@ -19,145 +25,62 @@ fn main() { .define("_MSC_VER", "1") .define("_AMD64_", "1") .compile("Chopper") - } else if os == "linux" { - if arch == "x86_64" || arch == "x86" { - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - 
.file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + } - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + cc::Build::new() + .define("__LINUX__", "1") + .define(cpu, "1") + .define(extra_four_q_define, "1") + .define("_AVX_", "1") + .define("USE_ENDO", "true") + .include("../ffi-deps/FourQlib/FourQ_32bit") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") + .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") + .file("../ffi-deps/FourQlib/random/random.c") + .file("../ffi-deps/FourQlib/sha512/sha512.c") + .compile("libFourQ"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_X86_", "1") - .compile("Chopper") - } else { //ARM - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - 
.file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); + let mut binding = cc::Build::new(); + let k12 = binding + .include("../ffi-deps/K12/lib") + .file("../ffi-deps/K12/lib/KangarooTwelve.c"); - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); + if os == "linux" { + k12 + .include("../ffi-deps/K12/lib/Optimized64") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") + .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") + .flag("-march=native") + .flag("-mavx512vl") + .flag("-mavx512f") + .flag("-msse3") + .compile("KangarooTwelve"); - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - 
.define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define(cpu, "1") + .compile("Chopper") } else { + k12 + .include("../ffi-deps/K12/lib/Inplace32BI") + .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") + .compile("KangarooTwelve"); - if arch == "x86_64" || arch == "x86" { //Intel Mac - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } else { //Mac M1 Series - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - 
cc::Build::new() - .include("../ffi-deps/K12/lib") - .include("../ffi-deps/K12/lib/Inplace32BI") - .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .file("../ffi-deps/K12/lib/KangarooTwelve.c") - .compile("KangarooTwelve"); + cc::Build::new() + .file("../ffi-deps/chopper-linux.cpp") + .define("__LINUX__", "1") + .define("_AMD64_", "1") + .compile("Chopper") - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } } } diff --git a/identity/Cargo.toml b/identity/Cargo.toml index bcfab0c..11f1988 100644 --- a/identity/Cargo.toml +++ b/identity/Cargo.toml @@ -9,6 +9,3 @@ libc = "0.2.147" crypto = { path = '../crypto', features = ["encryption"] } logger = { path = '../logger' } -[build-dependencies] -bindgen = "0.69.1" -cc = "1.0.79" diff --git a/identity/build.rs b/identity/build.rs deleted file mode 100644 index d9f70e0..0000000 --- a/identity/build.rs +++ /dev/null @@ -1,69 +0,0 @@ -extern crate cc; - -fn main() { - - if std::env::consts::OS == "windows" { - cc::Build::new() - .file("../ffi-deps/chopper-win.cpp") - .define("_MSC_VER", "1") - .define("_AMD64_", "1") - .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") - .compile("Chopper") - } else if std::env::consts::OS == "linux" { - cc::Build::new() - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - - println!("cargo:rustc-link-lib=libFourQ"); - println!("cargo:rustc-link-lib=dylib=libFourQ"); - 
- cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_X86_", "1") - .define("_AVX_", "1") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/FourQ.h") - .compile("Chopper") - } else { - cc::Build::new() - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .define("_ARM_", "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - - println!("cargo:rustc-link-lib=libFourQ"); - println!("cargo:rustc-link-lib=dylib=libFourQ"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - } -} - From 5c38c22a230786c534aa1d2d0421e38a405f106e Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 15 Dec 2023 16:04:12 -0500 Subject: [PATCH 7/7] Explicit return in builder --- crypto/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/build.rs b/crypto/build.rs index 3ca1657..e6dcb11 100644 --- a/crypto/build.rs +++ b/crypto/build.rs @@ -20,11 +20,11 @@ fn main() { }; if os == "windows" { - cc::Build::new() + return cc::Build::new() .file("../ffi-deps/chopper-win.cpp") .define("_MSC_VER", "1") .define("_AMD64_", "1") - .compile("Chopper") + .compile("Chopper"); } cc::Build::new()