diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S new file mode 100644 index 0000000..09aa0d2 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S @@ -0,0 +1,623 @@ +# K12 based on the eXtended Keccak Code Package (XKCP) +# https://github.com/XKCP/XKCP +# +# The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. +# +# Implementation by Gilles Van Assche, hereby denoted as "the implementer". +# Core subroutine is based on one by Andy Polyakov, available +# at https://github.com/dot-asm/cryptogams. Used with permission. +# +# For more information, feedback or questions, please refer to the Keccak Team website: +# https://keccak.team/ +# +# To the extent possible under law, the implementer has waived all copyright +# and related or neighboring rights to the source code in this file. +# http://creativecommons.org/publicdomain/zero/1.0/ + +.text + +.balign 64 // strategic alignment and padding that allows to use + // address value as loop termination condition... + .quad 0,0,0,0,0,0,0,0 +.ifdef macOS +.else +.type iotas,%object +.endif +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a +iotas12: + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.ifdef macOS +.else +.size iotas,.-iotas +.endif + +.ifdef macOS +.else +.type KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,%function +.endif +KeccakP1600_ARMv8Asha3_Permute_12rounds_internal: +.balign 32 + mov x9,#12 + adr x10,iotas12 + b .Loop_ce +.balign 16 +.Loop_ce: + ////////////////////////////////////////////////// Theta + eor3 v25.16b,v20.16b,v15.16b,v10.16b + eor3 v26.16b,v21.16b,v16.16b,v11.16b + eor3 v27.16b,v22.16b,v17.16b,v12.16b + eor3 v28.16b,v23.16b,v18.16b,v13.16b + eor3 v29.16b,v24.16b,v19.16b,v14.16b + eor3 v25.16b,v25.16b, v5.16b,v0.16b + eor3 v26.16b,v26.16b, v6.16b,v1.16b + eor3 v27.16b,v27.16b, v7.16b,v2.16b + eor3 v28.16b,v28.16b, v8.16b,v3.16b + eor3 v29.16b,v29.16b, v9.16b,v4.16b + + rax1 v30.2d,v25.2d,v27.2d // D[1] + rax1 v31.2d,v26.2d,v28.2d // D[2] + rax1 v27.2d,v27.2d,v29.2d // D[3] + rax1 v28.2d,v28.2d,v25.2d // D[4] + rax1 v29.2d,v29.2d,v26.2d // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi + xar v25.2d, v1.2d,v30.2d,#64-1 // C[0]=A[2][0] + + xar v1.2d,v6.2d,v30.2d,#64-44 + xar v6.2d,v9.2d,v28.2d,#64-20 + xar v9.2d,v22.2d,v31.2d,#64-61 + xar v22.2d,v14.2d,v28.2d,#64-39 + xar v14.2d,v20.2d,v29.2d,#64-18 + + xar v26.2d, v2.2d,v31.2d,#64-62 // C[1]=A[4][0] + + xar v2.2d,v12.2d,v31.2d,#64-43 + xar v12.2d,v13.2d,v27.2d,#64-25 + xar v13.2d,v19.2d,v28.2d,#64-8 + xar v19.2d,v23.2d,v27.2d,#64-56 + xar v23.2d,v15.2d,v29.2d,#64-41 + + xar v15.2d,v4.2d,v28.2d,#64-27 + + xar v28.2d, v24.2d,v28.2d,#64-14 // D[4]=A[0][4] + xar v24.2d,v21.2d,v30.2d,#64-2 + xar v8.2d,v8.2d,v27.2d,#64-55 // A[1][3]=A[4][1] + xar v4.2d,v16.2d,v30.2d,#64-45 // A[0][4]=A[1][3] + xar v16.2d,v5.2d,v29.2d,#64-36 + + xar 
v5.2d,v3.2d,v27.2d,#64-28 + + eor v0.16b,v0.16b,v29.16b + + xar v27.2d, v18.2d,v27.2d,#64-21 // D[3]=A[0][3] + xar v3.2d,v17.2d,v31.2d,#64-15 // A[0][3]=A[3][3] + xar v30.2d, v11.2d,v30.2d,#64-10 // D[1]=A[3][2] + xar v31.2d, v7.2d,v31.2d,#64-6 // D[2]=A[2][1] + xar v29.2d, v10.2d,v29.2d,#64-3 // D[0]=A[1][2] + + ////////////////////////////////////////////////// Chi+Iota + bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] + bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] + bcax v22.16b,v22.16b,v24.16b,v23.16b + bcax v23.16b,v23.16b,v26.16b, v24.16b + bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] + + ld1r {v26.2d},[x10],#8 + + bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] + bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] + bcax v19.16b,v19.16b,v16.16b,v15.16b + bcax v15.16b,v15.16b,v30.16b, v16.16b + bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] + + bcax v10.16b,v25.16b, v12.16b,v31.16b + bcax v11.16b,v31.16b, v13.16b,v12.16b + bcax v12.16b,v12.16b,v14.16b,v13.16b + bcax v13.16b,v13.16b,v25.16b, v14.16b + bcax v14.16b,v14.16b,v31.16b, v25.16b + + bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] + bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] + bcax v9.16b,v9.16b,v6.16b,v5.16b + bcax v5.16b,v5.16b,v29.16b, v6.16b + bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] + + bcax v3.16b,v27.16b, v0.16b,v28.16b + bcax v4.16b,v28.16b, v1.16b,v0.16b + bcax v0.16b,v0.16b,v2.16b,v1.16b + bcax v1.16b,v1.16b,v27.16b, v2.16b + bcax v2.16b,v2.16b,v28.16b, v27.16b + + eor v0.16b,v0.16b,v26.16b + + subs x9,x9,#1 + bne .Loop_ce + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,.-KeccakP1600_ARMv8Asha3_Permute_12rounds_internal +.endif + +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_Permute_12rounds +_KeccakP1600_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600_ARMv8Asha3_Permute_12rounds +.type KeccakP1600_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
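+    // Load the 25 state lanes into the low 64 bits of v0-v24 (one lane per
+    // vector register), run the 12-round permutation via the internal
+    // routine above, then write the lanes back to the state buffer at x0.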
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + ldr x30,[sp,#8] + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_Permute_12rounds,.-KeccakP1600_ARMv8Asha3_Permute_12rounds +.endif + +// size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb( +// void *state(x0), +// unsigned int laneCount(x1) = 21, +// const unsigned char *data(x2), +// size_t dataByteLen(x3)) +.ifdef macOS +.globl _KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +_KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.type KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,%function +KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + + // Prepare the return value + mov x11, #0 + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop + +.balign 16 +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop: + subs x3, x3, #8*21 + b.cc .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end + + // Lanes 0-3 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v0.16b, v0.16b, v27.16b + eor v1.16b, v1.16b, v28.16b + eor v2.16b, v2.16b, v29.16b + eor v3.16b, v3.16b, v30.16b + + // Lanes 4-7 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v4.16b, v4.16b, v27.16b + eor v5.16b, v5.16b, v28.16b + eor v6.16b, v6.16b, v29.16b + eor v7.16b, v7.16b, v30.16b + + // Lanes 8-11 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v8.16b, v8.16b, v27.16b + eor v9.16b, v9.16b, v28.16b + eor v10.16b, v10.16b, v29.16b + eor v11.16b, v11.16b, v30.16b + + // Lanes 12-15 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v12.16b, v12.16b, v27.16b + eor v13.16b, v13.16b, v28.16b + eor v14.16b, v14.16b, v29.16b + eor v15.16b, v15.16b, v30.16b + + // Lanes 16-20 + ld1 {v27.8b-v30.8b}, [x2], #32 + eor v16.16b, v16.16b, v27.16b + eor v17.16b, v17.16b, v28.16b + eor v18.16b, v18.16b, v29.16b + eor v19.16b, v19.16b, v30.16b + ld1 {v27.8b}, [x2], #8 + eor v20.16b, v20.16b, v27.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + add x11, x11, #8*21 + + b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop +.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end: + + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp 
d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + mov x0, x11 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,.-KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb +.endif + +.ifdef macOS +.globl _KeccakP1600times2_ARMv8Asha3_Permute_12rounds +_KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.else +.globl KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.type KeccakP1600times2_ARMv8Asha3_Permute_12rounds,%function +KeccakP1600times2_ARMv8Asha3_Permute_12rounds: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + ld1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + ld1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + ld1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + ld1 {v24.2d}, [x0] + sub x0, x0, #64*6 + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + ldr x30,[sp,#8] + st1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + st1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + st1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + st1 {v24.2d}, [x0] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KeccakP1600times2_ARMv8Asha3_Permute_12rounds,.-KeccakP1600times2_ARMv8Asha3_Permute_12rounds +.endif + +.ifdef macOS +.globl _KangarooTwelve_ARMv8Asha3_Process2Leaves +_KangarooTwelve_ARMv8Asha3_Process2Leaves: +.else +.globl KangarooTwelve_ARMv8Asha3_Process2Leaves +.type KangarooTwelve_ARMv8Asha3_Process2Leaves,%function +KangarooTwelve_ARMv8Asha3_Process2Leaves: +.endif +.balign 32 + stp x29,x30,[sp,#-80]! 
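+    // Process two consecutive 8192-byte leaves at once: leaf 0 lives in the
+    // low 64-bit half of each vector register and leaf 1 (read from
+    // input+8192, tracked in x12) in the high half. Each leaf is absorbed as
+    // 48 full 21-lane (168-byte) blocks, then one last 128-byte block that
+    // receives the 0x0B suffix in lane 16 and the 0x80 padding byte at the
+    // end of lane 20, and the first four lanes (32 bytes) of each state are
+    // stored to the output as that leaf's chaining value.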
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + movi v4.2d, #0 + movi v5.2d, #0 + movi v6.2d, #0 + movi v7.2d, #0 + movi v8.2d, #0 + movi v9.2d, #0 + movi v10.2d, #0 + movi v11.2d, #0 + movi v12.2d, #0 + movi v13.2d, #0 + movi v14.2d, #0 + movi v15.2d, #0 + movi v16.2d, #0 + movi v17.2d, #0 + movi v18.2d, #0 + movi v19.2d, #0 + movi v20.2d, #0 + movi v21.2d, #0 + movi v22.2d, #0 + movi v23.2d, #0 + movi v24.2d, #0 + + // x12 is input + chunkSize + add x12, x0, #8192 + + // Loop over the first 48 blocks + mov x11, 48 + b .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks +.KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks: + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + // Lanes 16-20 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 + ld1 {v29.d}[0], [x0], #8 + ld1 {v29.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b + rev64 v29.16b, v29.16b +#endif + eor v16.16b, v16.16b, v25.16b + eor v17.16b, v17.16b, v26.16b + eor v18.16b, v18.16b, v27.16b + eor v19.16b, v19.16b, v28.16b + eor v20.16b, v20.16b, v29.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + subs x11, x11, #1 + bne .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks + + // Lanes 0-3 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + + // Lanes 4-7 + ld1 
{v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v4.16b, v4.16b, v25.16b + eor v5.16b, v5.16b, v26.16b + eor v6.16b, v6.16b, v27.16b + eor v7.16b, v7.16b, v28.16b + + // Lanes 8-11 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v8.16b, v8.16b, v25.16b + eor v9.16b, v9.16b, v26.16b + eor v10.16b, v10.16b, v27.16b + eor v11.16b, v11.16b, v28.16b + + // Lanes 12-15 + ld1 {v25.1d-v28.1d}, [x0], #32 + ld1 {v25.d}[1], [x12], #8 + ld1 {v26.d}[1], [x12], #8 + ld1 {v27.d}[1], [x12], #8 + ld1 {v28.d}[1], [x12], #8 +#ifdef __AARCH64EB__ + rev64 v25.16b, v25.16b + rev64 v26.16b, v26.16b + rev64 v27.16b, v27.16b + rev64 v28.16b, v28.16b +#endif + eor v12.16b, v12.16b, v25.16b + eor v13.16b, v13.16b, v26.16b + eor v14.16b, v14.16b, v27.16b + eor v15.16b, v15.16b, v28.16b + + mov x13, #0x0B + dup v25.2d, x13 + mov x13, #0x8000000000000000 + dup v26.2d, x13 + eor v16.16b, v16.16b, v25.16b + eor v20.16b, v20.16b, v26.16b + + bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal + + st1 {v0.1d-v3.1d}, [x1], #32 + st1 {v0.d}[1], [x1], #8 + st1 {v1.d}[1], [x1], #8 + st1 {v2.d}[1], [x1], #8 + st1 {v3.d}[1], [x1], #8 + + ldr x30,[sp,#8] + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + + ret +.ifdef macOS +.else +.size KangarooTwelve_ARMv8Asha3_Process2Leaves,.-KangarooTwelve_ARMv8Asha3_Process2Leaves +.endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h new file mode 100644 index 0000000..512eca3 --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h @@ -0,0 +1,65 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +#define KeccakP1600_Initialize KeccakP1600_opt64_Initialize +#define KeccakP1600_AddByte KeccakP1600_opt64_AddByte +#define KeccakP1600_AddBytes KeccakP1600_opt64_AddBytes +#define KeccakP1600_Permute_12rounds KeccakP1600_ARMv8Asha3_Permute_12rounds +#define KeccakP1600_ExtractBytes KeccakP1600_opt64_ExtractBytes +#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); +void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state); +void KangarooTwelve_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output); + +#define KeccakP1600times2_Permute_12rounds KeccakP1600times2_ARMv8Asha3_Permute_12rounds +#define KangarooTwelve_Process2Leaves KangarooTwelve_ARMv8Asha3_Process2Leaves + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c new file mode 100644 index 0000000..7228d7a --- /dev/null +++ b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c @@ -0,0 +1,227 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#include +#include +#include +#include + +const char * KeccakP1600_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane; + + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; + ((uint64_t*)state)[lanePosition] ^= lane; +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ + unsigned int i = 0; + + for( ; (i+8)<=laneCount; i+=8) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4]; + ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5]; + ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6]; + ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7]; + } + for( ; (i+4)<=laneCount; i+=4) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; + ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; + } + for( ; (i+2)<=laneCount; i+=2) { + ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; + ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; + } + if (i 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ + memcpy(data, state, laneCount*8); +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int 
_lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable() +{ + return 1; +} + +const char * KeccakP1600times2_GetImplementation() +{ + return "ARMv8-A+SHA3 optimized implementation"; +} + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times4_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ +} + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable() +{ + return 0; +} + +const char * KeccakP1600times8_GetImplementation() +{ + return ""; +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ +} diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h new file mode 100644 index 0000000..ac76272 --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h @@ -0,0 +1,35 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); + +#endif diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c new file mode 100644 index 0000000..a72dc7c --- /dev/null +++ b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c @@ -0,0 +1,1068 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 
+ +Implementation by Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +const char * KeccakP1600_GetImplementation() +{ + return "in-place 32-bit implementation"; +} + + +#define ROL32(a, offset) ((((uint32_t)a) << (offset)) ^ (((uint32_t)a) >> (32-(offset)))) + +/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + temp0 = (low); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp1 = (high); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); + +#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000); + +#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \ + prepareToBitInterleaving(low, high, temp, temp0, temp1) \ + even = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + odd = (temp0 >> 16) | (temp1 & 0xFFFF0000); + +/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ +#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + temp0 = (even); \ + temp1 = (odd); \ + temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \ + temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \ + temp0 = temp; \ + temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ + temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ + temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ + temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ + temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \ + temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ + temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ + temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); + +#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + low = temp0; \ + high = temp1; + +#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \ + prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ + lowOut = lowIn ^ temp0; \ + highOut = highIn ^ temp1; + +void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0xFF, offset); + memset(laneAsBytes+offset, 0x00, length); + memset(laneAsBytes+offset+length, 0xFF, 8-offset-length); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | ((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Initialize(void *state) +{ + memset(state, 0, 200); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) +{ + unsigned int lanePosition = offset/8; + unsigned int offsetInLane = offset%8; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + if (offsetInLane < 4) { + low = (uint32_t)byte << (offsetInLane*8); + high = 0; + } + else { + low = 0; + high = (uint32_t)byte << ((offsetInLane-4)*8); + } + toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ + uint8_t laneAsBytes[8]; + uint32_t low, high; + uint32_t temp, temp0, temp1; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + + memset(laneAsBytes, 0, 8); + memcpy(laneAsBytes+offset, data, length); +#if (PLATFORM_BYTE_ORDER == 
IS_LITTLE_ENDIAN) + low = *((uint32_t*)(laneAsBytes+0)); + high = *((uint32_t*)(laneAsBytes+4)); +#else + low = laneAsBytes[0] + | ((uint32_t)(laneAsBytes[1]) << 8) + | ((uint32_t)(laneAsBytes[2]) << 16) + | ((uint32_t)(laneAsBytes[3]) << 24); + high = laneAsBytes[4] + | ((uint32_t)(laneAsBytes[5]) << 8) + | ((uint32_t)(laneAsBytes[6]) << 16) + | ((uint32_t)(laneAsBytes[7]) << 24); +#endif + toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); +} + +/* ---------------------------------------------------------------- */ + +static void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + const uint32_t * pI = (const uint32_t *)data; + uint32_t * pS = (uint32_t*)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + memcpy(&low, pI++, 4); + memcpy(&high, pI++, 4); + toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1); +#else + toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint32_t *stateAsHalfLanes = (uint32_t*)state; + uint32_t low, high, temp, temp0, temp1; + uint8_t laneAsBytes[8]; + + fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1); +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + *((uint32_t*)(laneAsBytes+0)) = low; + *((uint32_t*)(laneAsBytes+4)) = high; +#else + laneAsBytes[0] = low & 0xFF; + laneAsBytes[1] = (low >> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low >> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; +#endif + memcpy(data, laneAsBytes+offset, length); +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint32_t * pI = (uint32_t *)data; + const uint32_t * pS = ( const uint32_t *)state; + uint32_t t, x0, x1; + int i; + for (i = laneCount-1; i >= 0; --i) { +#ifdef NO_MISALIGNED_ACCESSES + uint32_t low; + uint32_t high; + fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1); + memcpy(pI++, &low, 4); + memcpy(pI++, &high, 4); +#else + fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1) +#endif + } +#else + unsigned int lanePosition; + for(lanePosition=0; lanePosition> 8) & 0xFF; + laneAsBytes[2] = (low >> 16) & 0xFF; + laneAsBytes[3] = (low 
>> 24) & 0xFF; + laneAsBytes[4] = high & 0xFF; + laneAsBytes[5] = (high >> 8) & 0xFF; + laneAsBytes[6] = (high >> 16) & 0xFF; + laneAsBytes[7] = (high >> 24) & 0xFF; + memcpy(data+lanePosition*8, laneAsBytes, 8); + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +static const uint32_t KeccakF1600RoundConstants_int2[2*24+1] = +{ + 0x00000001UL, 0x00000000UL, + 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, + 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, + 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, + 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, + 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, + 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, + 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, + 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, + 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, + 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, + 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, + 0x00000000UL, 0x80008082UL, + 0x000000FFUL +}; + +#define KeccakRound0() \ + Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \ + Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ + Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ + De1 = Cz^Cw; \ + Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Age0^De0), 22); \ + Bi = ROL32((Aki1^Di1), 22); \ + Bo = ROL32((Amo1^Do1), 11); \ + Bu = ROL32((Asu0^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Age0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Age1^De1), 22); \ + Bi = ROL32((Aki0^Di0), 21); \ + Bo = ROL32((Amo0^Do0), 10); \ + Bu = ROL32((Asu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= 
*(pRoundConstants++); \ + Age1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka1^Da1), 2); \ + Bo = ROL32((Ame1^De1), 23); \ + Bu = ROL32((Asi1^Di1), 31); \ + Ba = ROL32((Abo0^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aka0^Da0), 1); \ + Bo = ROL32((Ame0^De0), 22); \ + Bu = ROL32((Asi0^Di0), 30); \ + Ba = ROL32((Abo1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa0^Da0), 9); \ + Ba = ROL32((Abe1^De1), 1); \ + Be = ROL32((Agi0^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Amu0^Du0), 4); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Asa1^Da1), 9); \ + Ba = (Abe0^De0); \ + Be = ROL32((Agi1^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Amu1^Du1), 4); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga0^Da0), 18); \ + Bi = ROL32((Ake0^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Aso0^Do0), 28); \ + Ba = ROL32((Abu1^Du1), 14); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aga1^Da1), 18); \ + Bi = ROL32((Ake1^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Aso1^Do1), 28); \ + Ba = ROL32((Abu0^Du0), 13); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Abi0^Di0), 31); \ + Be = ROL32((Ago1^Do1), 28); \ + Bi = ROL32((Aku1^Du1), 20); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Ama0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Abi1^Di1), 31); \ + Be = ROL32((Ago0^Do0), 27); \ + Bi = ROL32((Aku0^Du0), 19); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ) + +#define KeccakRound1() \ + Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \ + Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \ + Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \ + De1 = Cz^Cw; \ + Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ame1^De0), 22); \ + Bi = ROL32((Agi1^Di1), 22); \ + Bo = ROL32((Aso1^Do1), 11); \ + Bu = ROL32((Aku1^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ame0^De1), 22); \ + Bi = 
ROL32((Agi0^Di0), 21); \ + Bo = ROL32((Aso0^Do0), 10); \ + Bu = ROL32((Aku0^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa1^Da1), 2); \ + Bo = ROL32((Ake1^De1), 23); \ + Bu = ROL32((Abi1^Di1), 31); \ + Ba = ROL32((Amo1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Asa0^Da0), 1); \ + Bo = ROL32((Ake0^De0), 22); \ + Bu = ROL32((Abi0^Di0), 30); \ + Ba = ROL32((Amo0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama1^Da0), 9); \ + Ba = ROL32((Age1^De1), 1); \ + Be = ROL32((Asi1^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Abu1^Du0), 4); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Ama0^Da1), 9); \ + Ba = (Age0^De0); \ + Be = ROL32((Asi0^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Abu0^Du1), 4); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aka1^Da0), 18); \ + Bi = ROL32((Abe1^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Ago1^Do0), 28); \ + Ba = ROL32((Asu1^Du1), 14); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Aka0^Da1), 18); \ + Bi = ROL32((Abe0^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Ago0^Do1), 28); \ + Ba = ROL32((Asu0^Du0), 13); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga1^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Aki1^Di0), 31); \ + Be = ROL32((Abo1^Do1), 28); \ + Bi = ROL32((Amu1^Du1), 20); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aga0^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Aki0^Di1), 31); \ + Be = ROL32((Abo0^Do0), 27); \ + Bi = ROL32((Amu0^Du0), 19); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); + +#define KeccakRound2() \ + Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \ + Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \ + Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \ + De1 = Cz^Cw; \ + Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Ake1^De0), 22); \ + Bi = ROL32((Asi0^Di1), 22); \ + Bo = ROL32((Ago0^Do1), 11); \ + Bu = ROL32((Amu1^Du0), 7); \ + Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ 
+ Ago0 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Ake0^De1), 22); \ + Bi = ROL32((Asi1^Di0), 21); \ + Bo = ROL32((Ago1^Do0), 10); \ + Bu = ROL32((Amu0^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama0^Da1), 2); \ + Bo = ROL32((Abe0^De1), 23); \ + Bu = ROL32((Aki0^Di1), 31); \ + Ba = ROL32((Aso1^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Ama1^Da0), 1); \ + Bo = ROL32((Abe1^De0), 22); \ + Bu = ROL32((Aki1^Di0), 30); \ + Ba = ROL32((Aso0^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga1^Da0), 9); \ + Ba = ROL32((Ame0^De1), 1); \ + Be = ROL32((Abi1^Di0), 3); \ + Bi = ROL32((Ako1^Do1), 13); \ + Bo = ROL32((Asu1^Du0), 4); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aga0^Da1), 9); \ + Ba = (Ame1^De0); \ + Be = ROL32((Abi0^Di1), 3); \ + Bi = ROL32((Ako0^Do0), 12); \ + Bo = ROL32((Asu0^Du1), 4); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa1^Da0), 18); \ + Bi = ROL32((Age1^De0), 5); \ + Bo = ROL32((Ami1^Di1), 8); \ + Bu = ROL32((Abo1^Do0), 28); \ + Ba = ROL32((Aku0^Du1), 14); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Asa0^Da1), 18); \ + Bi = ROL32((Age0^De1), 5); \ + Bo = ROL32((Ami0^Di0), 7); \ + Bu = ROL32((Abo0^Do1), 28); \ + Ba = ROL32((Aku1^Du0), 13); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aka0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Agi1^Di0), 31); \ + Be = ROL32((Amo0^Do1), 28); \ + Bi = ROL32((Abu0^Du1), 20); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Aka1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Agi0^Di1), 31); \ + Be = ROL32((Amo1^Do0), 27); \ + Bi = ROL32((Abu1^Du0), 19); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); + +#define KeccakRound3() \ + Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \ + Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \ + Da0 = Cx^ROL32(Du1, 1); \ + Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \ + Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \ + Da1 = Cz^Du0; \ + Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \ + Do0 = Cw^ROL32(Cz, 1); \ + Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \ + Do1 = Cy^Cx; \ + Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \ + De0 = Cx^ROL32(Cy, 1); \ + Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \ + De1 = Cz^Cw; \ + Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \ + Di0 = Du0^ROL32(Cy, 1); \ + Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \ + Di1 = Du1^Cw; \ + Du0 = Cw^ROL32(Cz, 1); \ + Du1 = Cy^Cx; \ +\ + Ba = (Aba0^Da0); \ + Be = ROL32((Abe0^De0), 22); \ + Bi = ROL32((Abi0^Di1), 22); \ + Bo = ROL32((Abo0^Do1), 11); \ + Bu = ROL32((Abu0^Du0), 7); \ + 
Aba0 = Ba ^((~Be)& Bi ); \ + Aba0 ^= *(pRoundConstants++); \ + Abe0 = Be ^((~Bi)& Bo ); \ + Abi0 = Bi ^((~Bo)& Bu ); \ + Abo0 = Bo ^((~Bu)& Ba ); \ + Abu0 = Bu ^((~Ba)& Be ); \ + Ba = (Aba1^Da1); \ + Be = ROL32((Abe1^De1), 22); \ + Bi = ROL32((Abi1^Di0), 21); \ + Bo = ROL32((Abo1^Do0), 10); \ + Bu = ROL32((Abu1^Du1), 7); \ + Aba1 = Ba ^((~Be)& Bi ); \ + Aba1 ^= *(pRoundConstants++); \ + Abe1 = Be ^((~Bi)& Bo ); \ + Abi1 = Bi ^((~Bo)& Bu ); \ + Abo1 = Bo ^((~Bu)& Ba ); \ + Abu1 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga0^Da1), 2); \ + Bo = ROL32((Age0^De1), 23); \ + Bu = ROL32((Agi0^Di1), 31); \ + Ba = ROL32((Ago0^Do0), 14); \ + Be = ROL32((Agu0^Du0), 10); \ + Aga0 = Ba ^((~Be)& Bi ); \ + Age0 = Be ^((~Bi)& Bo ); \ + Agi0 = Bi ^((~Bo)& Bu ); \ + Ago0 = Bo ^((~Bu)& Ba ); \ + Agu0 = Bu ^((~Ba)& Be ); \ + Bi = ROL32((Aga1^Da0), 1); \ + Bo = ROL32((Age1^De0), 22); \ + Bu = ROL32((Agi1^Di0), 30); \ + Ba = ROL32((Ago1^Do1), 14); \ + Be = ROL32((Agu1^Du1), 10); \ + Aga1 = Ba ^((~Be)& Bi ); \ + Age1 = Be ^((~Bi)& Bo ); \ + Agi1 = Bi ^((~Bo)& Bu ); \ + Ago1 = Bo ^((~Bu)& Ba ); \ + Agu1 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka0^Da0), 9); \ + Ba = ROL32((Ake0^De1), 1); \ + Be = ROL32((Aki0^Di0), 3); \ + Bi = ROL32((Ako0^Do1), 13); \ + Bo = ROL32((Aku0^Du0), 4); \ + Aka0 = Ba ^((~Be)& Bi ); \ + Ake0 = Be ^((~Bi)& Bo ); \ + Aki0 = Bi ^((~Bo)& Bu ); \ + Ako0 = Bo ^((~Bu)& Ba ); \ + Aku0 = Bu ^((~Ba)& Be ); \ + Bu = ROL32((Aka1^Da1), 9); \ + Ba = (Ake1^De0); \ + Be = ROL32((Aki1^Di1), 3); \ + Bi = ROL32((Ako1^Do0), 12); \ + Bo = ROL32((Aku1^Du1), 4); \ + Aka1 = Ba ^((~Be)& Bi ); \ + Ake1 = Be ^((~Bi)& Bo ); \ + Aki1 = Bi ^((~Bo)& Bu ); \ + Ako1 = Bo ^((~Bu)& Ba ); \ + Aku1 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama0^Da0), 18); \ + Bi = ROL32((Ame0^De0), 5); \ + Bo = ROL32((Ami0^Di1), 8); \ + Bu = ROL32((Amo0^Do0), 28); \ + Ba = ROL32((Amu0^Du1), 14); \ + Ama0 = Ba ^((~Be)& Bi ); \ + Ame0 = Be ^((~Bi)& Bo ); \ + Ami0 = Bi ^((~Bo)& Bu ); \ + Amo0 = Bo ^((~Bu)& Ba ); \ + Amu0 = Bu ^((~Ba)& Be ); \ + Be = ROL32((Ama1^Da1), 18); \ + Bi = ROL32((Ame1^De1), 5); \ + Bo = ROL32((Ami1^Di0), 7); \ + Bu = ROL32((Amo1^Do1), 28); \ + Ba = ROL32((Amu1^Du0), 13); \ + Ama1 = Ba ^((~Be)& Bi ); \ + Ame1 = Be ^((~Bi)& Bo ); \ + Ami1 = Bi ^((~Bo)& Bu ); \ + Amo1 = Bo ^((~Bu)& Ba ); \ + Amu1 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Asa0^Da1), 21); \ + Bu = ROL32((Ase0^De0), 1); \ + Ba = ROL32((Asi0^Di0), 31); \ + Be = ROL32((Aso0^Do1), 28); \ + Bi = ROL32((Asu0^Du1), 20); \ + Asa0 = Ba ^((~Be)& Bi ); \ + Ase0 = Be ^((~Bi)& Bo ); \ + Asi0 = Bi ^((~Bo)& Bu ); \ + Aso0 = Bo ^((~Bu)& Ba ); \ + Asu0 = Bu ^((~Ba)& Be ); \ + Bo = ROL32((Asa1^Da0), 20); \ + Bu = ROL32((Ase1^De1), 1); \ + Ba = ROL32((Asi1^Di1), 31); \ + Be = ROL32((Aso1^Do0), 27); \ + Bi = ROL32((Asu1^Du0), 19); \ + Asa1 = Ba ^((~Be)& Bi ); \ + Ase1 = Be ^((~Bi)& Bo ); \ + Asi1 = Bi ^((~Bo)& Bu ); \ + Aso1 = Bo ^((~Bu)& Ba ); \ + Asu1 = Bu ^((~Ba)& Be ); + +void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) +{ + uint32_t Da0, De0, Di0, Do0, Du0; + uint32_t Da1, De1, Di1, Do1, Du1; + uint32_t Ba, Be, Bi, Bo, Bu; + uint32_t Cx, Cy, Cz, Cw; + const uint32_t *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; + uint32_t *stateAsHalfLanes = (uint32_t*)state; + #define Aba0 stateAsHalfLanes[ 0] + #define Aba1 stateAsHalfLanes[ 1] + #define Abe0 stateAsHalfLanes[ 2] + #define Abe1 stateAsHalfLanes[ 3] + #define Abi0 stateAsHalfLanes[ 4] + #define Abi1 stateAsHalfLanes[ 5] + #define Abo0 stateAsHalfLanes[ 6] + #define Abo1 stateAsHalfLanes[ 7] + #define Abu0 
stateAsHalfLanes[ 8] + #define Abu1 stateAsHalfLanes[ 9] + #define Aga0 stateAsHalfLanes[10] + #define Aga1 stateAsHalfLanes[11] + #define Age0 stateAsHalfLanes[12] + #define Age1 stateAsHalfLanes[13] + #define Agi0 stateAsHalfLanes[14] + #define Agi1 stateAsHalfLanes[15] + #define Ago0 stateAsHalfLanes[16] + #define Ago1 stateAsHalfLanes[17] + #define Agu0 stateAsHalfLanes[18] + #define Agu1 stateAsHalfLanes[19] + #define Aka0 stateAsHalfLanes[20] + #define Aka1 stateAsHalfLanes[21] + #define Ake0 stateAsHalfLanes[22] + #define Ake1 stateAsHalfLanes[23] + #define Aki0 stateAsHalfLanes[24] + #define Aki1 stateAsHalfLanes[25] + #define Ako0 stateAsHalfLanes[26] + #define Ako1 stateAsHalfLanes[27] + #define Aku0 stateAsHalfLanes[28] + #define Aku1 stateAsHalfLanes[29] + #define Ama0 stateAsHalfLanes[30] + #define Ama1 stateAsHalfLanes[31] + #define Ame0 stateAsHalfLanes[32] + #define Ame1 stateAsHalfLanes[33] + #define Ami0 stateAsHalfLanes[34] + #define Ami1 stateAsHalfLanes[35] + #define Amo0 stateAsHalfLanes[36] + #define Amo1 stateAsHalfLanes[37] + #define Amu0 stateAsHalfLanes[38] + #define Amu1 stateAsHalfLanes[39] + #define Asa0 stateAsHalfLanes[40] + #define Asa1 stateAsHalfLanes[41] + #define Ase0 stateAsHalfLanes[42] + #define Ase1 stateAsHalfLanes[43] + #define Asi0 stateAsHalfLanes[44] + #define Asi1 stateAsHalfLanes[45] + #define Aso0 stateAsHalfLanes[46] + #define Aso1 stateAsHalfLanes[47] + #define Asu0 stateAsHalfLanes[48] + #define Asu1 stateAsHalfLanes[49] + + nRounds &= 3; + switch ( nRounds ) + { + #define I0 Ba + #define I1 Be + #define T0 Bi + #define T1 Bo + #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[eo0] = T0; (in0)[eo0^1] = T1; \ + T0 = (in2)[0]; T1 = (in2)[1]; \ + (in1)[eo1] = T0; (in1)[eo1^1] = T1; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[eo2] = T0; (in2)[eo2^1] = T1; \ + (in3)[eo3] = I0; (in3)[eo3^1] = I1 + #define SwapPI2( in0,in1,in2,in3 ) \ + I0 = (in0)[0]; I1 = (in0)[1]; \ + T0 = (in1)[0]; T1 = (in1)[1]; \ + (in0)[1] = T0; (in0)[0] = T1; \ + (in1)[1] = I0; (in1)[0] = I1; \ + I0 = (in2)[0]; I1 = (in2)[1]; \ + T0 = (in3)[0]; T1 = (in3)[1]; \ + (in2)[1] = T0; (in2)[0] = T1; \ + (in3)[1] = I0; (in3)[0] = I1 + #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 + + case 1: + SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 ); + SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 ); + SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 ); + SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 ); + break; + + case 2: + SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 ); + SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 ); + SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 ); + SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 ); + SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 ); + break; + + case 3: + SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 ); + SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 ); + SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 ); + SwapEO( Ami0, Ami1 ); + SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 ); + SwapEO( Ako0, Ako1 ); + SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 ); + break; + #undef I0 + #undef I1 + #undef T0 + #undef T1 + #undef SwapPI13 + #undef SwapPI2 + #undef SwapEO + } + + do + { + /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */ + switch ( nRounds ) + { + case 0: KeccakRound0(); /* fall through */ + case 3: KeccakRound1(); + case 2: KeccakRound2(); 
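+        /* fall through: the first pass through this switch runs 4, 3, 2 or 1
+           rounds depending on nRounds mod 4; every later pass enters at case 0
+           and runs four rounds, until the 0xFF sentinel that terminates
+           KeccakF1600RoundConstants_int2 is reached. */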
+ case 1: KeccakRound3(); + } + nRounds = 0; + } + while ( *pRoundConstants != 0xFF ); + + #undef Aba0 + #undef Aba1 + #undef Abe0 + #undef Abe1 + #undef Abi0 + #undef Abi1 + #undef Abo0 + #undef Abo1 + #undef Abu0 + #undef Abu1 + #undef Aga0 + #undef Aga1 + #undef Age0 + #undef Age1 + #undef Agi0 + #undef Agi1 + #undef Ago0 + #undef Ago1 + #undef Agu0 + #undef Agu1 + #undef Aka0 + #undef Aka1 + #undef Ake0 + #undef Ake1 + #undef Aki0 + #undef Aki1 + #undef Ako0 + #undef Ako1 + #undef Aku0 + #undef Aku1 + #undef Ama0 + #undef Ama1 + #undef Ame0 + #undef Ame1 + #undef Ami0 + #undef Ami1 + #undef Amo0 + #undef Amo1 + #undef Amu0 + #undef Amu1 + #undef Asa0 + #undef Asa1 + #undef Ase0 + #undef Ase1 + #undef Asi0 + #undef Asi1 + #undef Aso0 + #undef Aso1 + #undef Asu0 + #undef Asu1 +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_Permute_12rounds(void *state) +{ + KeccakP1600_Permute_Nrounds(state, 12); +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s new file mode 100644 index 0000000..d7ae46b --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s @@ -0,0 +1,664 @@ +# Copyright (c) 2006-2017, CRYPTOGAMS by +# Copyright (c) 2017 Ronny Van Keer +# All rights reserved. +# +# The source code in this file is licensed under the CRYPTOGAMS license. +# For further details see http://www.openssl.org/~appro/cryptogams/. +# +# Notes: +# The code for the permutation (__KeccakF1600) was generated with +# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project +# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). +# The rest of the code was written by Ronny Van Keer. +# Adaptations for macOS by Stéphane Léon. 
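+#
+# Usage sketch: the routines below implement the same SnP-style interface that is
+# declared in KeccakP-1600-SnP.h later in this patch. Given const unsigned char *data,
+# size_t dataLen and unsigned char output[32], a caller typically drives that interface
+# roughly as follows; the 168-byte rate, the 0x07/0x80 padding bytes and the 32-byte
+# output length are illustrative assumptions (as in KangarooTwelve's outer sponge), not
+# something defined by this file:
+#
+#     uint64_t stateBuf[KeccakP1600_stateSizeInBytes / 8];   /* 200 bytes, 8-byte aligned */
+#     void *state = stateBuf;
+#     const unsigned int rateInBytes = 168;                  /* assumed 1344-bit rate */
+#
+#     KeccakP1600_Initialize(state);
+#     /* fast path: whole 21-lane blocks, one permutation per block */
+#     size_t done = KeccakP1600_12rounds_FastLoop_Absorb(state, rateInBytes / 8, data, dataLen);
+#     data += done;
+#     dataLen -= done;
+#     /* leftover partial block, then (illustrative) padding of the last block */
+#     KeccakP1600_AddBytes(state, data, 0, (unsigned int)dataLen);
+#     KeccakP1600_AddByte(state, 0x07, (unsigned int)dataLen);
+#     KeccakP1600_AddByte(state, 0x80, rateInBytes - 1);
+#     KeccakP1600_Permute_12rounds(state);
+#     KeccakP1600_ExtractBytes(state, output, 0, 32);
+#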
+ +.text + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_Initialize(void *state); +# +.ifdef macOS +.globl _KeccakP1600_AVX2_Initialize +_KeccakP1600_AVX2_Initialize: +.else +.globl KeccakP1600_AVX2_Initialize +.type KeccakP1600_AVX2_Initialize,@function +KeccakP1600_AVX2_Initialize: +.endif +.balign 32 + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,0*32(%rdi) + vmovdqu %ymm0,1*32(%rdi) + vmovdqu %ymm0,2*32(%rdi) + vmovdqu %ymm0,3*32(%rdi) + vmovdqu %ymm0,4*32(%rdi) + vmovdqu %ymm0,5*32(%rdi) + movq $0,6*32(%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +# %rdi %rsi %rdx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddByte +_KeccakP1600_AVX2_AddByte: +.else +.globl KeccakP1600_AVX2_AddByte +.type KeccakP1600_AVX2_AddByte,@function +KeccakP1600_AVX2_AddByte: +.endif +.balign 32 + mov %rdx, %rax + and $7, %rax + and $0xFFFFFFF8, %edx + lea mapState(%rip), %r9 + mov (%r9, %rdx), %rdx + add %rdx, %rdi + add %rax, %rdi + xorb %sil, (%rdi) + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_AddBytes +_KeccakP1600_AVX2_AddBytes: +.else +.globl KeccakP1600_AVX2_AddBytes +.type KeccakP1600_AVX2_AddBytes,@function +KeccakP1600_AVX2_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rax, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane + add (%rdx), %rax # rax = pointer to state lane + add $8, %rdx + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rax) + inc %rax + dec %r9 + jnz KeccakP1600_AVX2_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_AddBytes_LaneAlignedCheck +KeccakP1600_AVX2_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + xor %r8, (%rax) +KeccakP1600_AVX2_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_AddBytes_LaneAlignedLoop +KeccakP1600_AVX2_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_AddBytes_Exit + mov (%rdx), %rax + add %rdi, %rax +KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rax) + inc %rax + dec %rcx + jnz KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_ExtractBytes +_KeccakP1600_AVX2_ExtractBytes: +.else +.globl KeccakP1600_AVX2_ExtractBytes +.type 
KeccakP1600_AVX2_ExtractBytes,@function +KeccakP1600_AVX2_ExtractBytes: +.endif +.balign 32 + push %rbx + cmp $0, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov %rdx, %rax # rax offset in lane + and $0xFFFFFFF8, %edx # rdx pointer into state index mapper + lea mapState(%rip), %r9 + add %r9, %rdx + and $7, %rax + jz KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck + mov $8, %rbx # rbx is (max) length of incomplete lane + sub %rax, %rbx + cmp %rcx, %rbx + cmovae %rcx, %rbx + sub %rbx, %rcx # length -= length of incomplete lane + mov (%rdx), %r9 + add $8, %rdx + add %rdi, %r9 + add %rax, %r9 +KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop: + mov (%r9), %r8b + inc %r9 + mov %r8b, (%rsi) + inc %rsi + dec %rbx + jnz KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop: + mov (%rdx), %rax + add $8, %rdx + add %rdi, %rax + mov (%rax), %r8 + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX2_ExtractBytes_Exit + mov (%rdx), %rax + add %rdi, %rax + mov (%rax), %r8 +KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop +KeccakP1600_AVX2_ExtractBytes_Exit: + pop %rbx + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx2: + ######################################### Theta + vpshufd $0b01001110,%ymm2,%ymm13 + vpxor %ymm3,%ymm5,%ymm12 + vpxor %ymm6,%ymm4,%ymm9 + vpxor %ymm1,%ymm12,%ymm12 + vpxor %ymm9,%ymm12,%ymm12 # C[1..4] + + vpermq $0b10010011,%ymm12,%ymm11 + vpxor %ymm2,%ymm13,%ymm13 + vpermq $0b01001110,%ymm13,%ymm7 + + vpsrlq $63,%ymm12,%ymm8 + vpaddq %ymm12,%ymm12,%ymm9 + vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) + + vpermq $0b00111001,%ymm8,%ymm15 + vpxor %ymm11,%ymm8,%ymm14 + vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] + + vpxor %ymm0,%ymm13,%ymm13 + vpxor %ymm7,%ymm13,%ymm13 # C[0..0] + + vpsrlq $63,%ymm13,%ymm7 + vpaddq %ymm13,%ymm13,%ymm8 + vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) + + vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] + vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] + + vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 + vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 + vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] + + ######################################### Rho + Pi + pre-Chi shuffle + vpsllvq 0*32-96(%r8),%ymm2,%ymm10 + vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 + vpor %ymm10,%ymm2,%ymm2 + + vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta + vpsllvq 2*32-96(%r8),%ymm3,%ymm11 + vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 + vpor %ymm11,%ymm3,%ymm3 + + vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta + vpsllvq 3*32-96(%r8),%ymm4,%ymm12 + vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 + vpor %ymm12,%ymm4,%ymm4 + + vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta + vpsllvq 4*32-96(%r8),%ymm5,%ymm13 + vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 + vpor %ymm13,%ymm5,%ymm5 + + vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta + vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 + vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 + vpsllvq 5*32-96(%r8),%ymm6,%ymm14 + vpsrlvq 
5*32-96(%r9),%ymm6,%ymm8 + vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 + + vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta + vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 + vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 + vpsllvq 1*32-96(%r8),%ymm1,%ymm15 + vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 + vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 + + ######################################### Chi + vpsrldq $8,%ymm8,%ymm14 + vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] + + vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] [2][0] + vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] + vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] + vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] + vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] + vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] + vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] + vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] + vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] + vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] + vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] + vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] + vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] + vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] + + vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] + vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] + vpxor %ymm10,%ymm3,%ymm3 + vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] + vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] + vpxor %ymm12,%ymm5,%ymm5 + vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] + vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] + vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] + vpxor %ymm13,%ymm6,%ymm6 + + vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] + vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] + vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] + vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] + vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] + + vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] + vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] + vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] + vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] [2][2] + vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] + vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] + vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] + vpxor %ymm9,%ymm2,%ymm2 + + vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] + vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle + vpermq $0b10001101,%ymm5,%ymm5 + vpermq $0b01110010,%ymm6,%ymm6 + + vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] + vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] + vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] + vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] + vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] + vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] + vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm11,%ymm4,%ymm4 + + 
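+        # Recap: the vpblendd/vpandn/vpxor sequence above is the Chi step. For each lane,
+        # with its two neighbours in the same row taken modulo 5, it computes
+        #     E[x] = B[x] ^ (~B[x+1] & B[x+2])
+        # vpandn supplies the (~ . & .) part for the shuffled operand layout documented in
+        # the bracketed [row][column] comments, and the vpxor instructions add in B[x].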
######################################### Iota + vpxor (%r10),%ymm0,%ymm0 + lea 32(%r10),%r10 + + dec %eax + jnz .Loop_avx2 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + + + +.ifdef macOS +.globl _KeccakP1600_AVX2_Permute_12rounds +_KeccakP1600_AVX2_Permute_12rounds: +.else +.globl KeccakP1600_AVX2_Permute_12rounds +.type KeccakP1600_AVX2_Permute_12rounds,@function +KeccakP1600_AVX2_Permute_12rounds: +.endif +.balign 32 + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + lea 96(%rdi),%rdi + vzeroupper + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 + call __KeccakF1600 + vmovq %xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu %ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX2_12rounds_FastLoop_Absorb +_KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function +KeccakP1600_AVX2_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit + vzeroupper + cmp $21, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes + sub $21, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa map2(%rip), %xmm15 + vpcmpeqd %ymm14, %ymm14, %ymm14 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_21(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_21(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_21(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa mask6_21(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $21*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit: + vmovq 
%xmm0,-96(%rdi) + vmovdqu %ymm1,8+32*0-96(%rdi) + vmovdqu %ymm2,8+32*1-96(%rdi) + vmovdqu %ymm3,8+32*2-96(%rdi) + vmovdqu %ymm4,8+32*3-96(%rdi) + vmovdqu %ymm5,8+32*4-96(%rdi) + vmovdqu %ymm6,8+32*5-96(%rdi) +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + sub $17, %rcx + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea 96(%rdi),%rdi + vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] + vmovdqu 8+32*0-96(%rdi),%ymm1 + vmovdqu 8+32*1-96(%rdi),%ymm2 + vmovdqu 8+32*2-96(%rdi),%ymm3 + vmovdqu 8+32*3-96(%rdi),%ymm4 + vmovdqu 8+32*4-96(%rdi),%ymm5 + vmovdqu 8+32*5-96(%rdi),%ymm6 +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes: + vpbroadcastq (%rdx),%ymm7 + vmovdqu 8(%rdx),%ymm8 + + vmovdqa mask2_17(%rip), %ymm14 + vpxor %ymm9, %ymm9, %ymm9 + vmovdqa map2(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 + + vmovdqa mask3_17(%rip), %ymm14 + vpxor %ymm10, %ymm10, %ymm10 + vmovdqa map3(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 + + vmovdqa mask4_17(%rip), %ymm14 + vpxor %ymm11, %ymm11, %ymm11 + vmovdqa map4(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 + + vmovdqa mask5_17(%rip), %ymm14 + vpxor %ymm12, %ymm12, %ymm12 + vmovdqa map5(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 + + vmovdqa mask6_17(%rip), %ymm14 + vpxor %ymm13, %ymm13, %ymm13 + vmovdqa map6(%rip), %xmm15 + vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 + + vpxor %ymm7,%ymm0,%ymm0 + vpxor %ymm8,%ymm1,%ymm1 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm11,%ymm4,%ymm4 + vpxor %ymm12,%ymm5,%ymm5 + vpxor %ymm13,%ymm6,%ymm6 + add $17*8, %rdx + lea iotas+12*4*8(%rip),%r10 + mov $12,%eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes: + lea mapState(%rip), %r9 + mov %rsi, %rax +KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + mov (%r9), %r10 + add $8, %r9 + add %rdi, %r10 + xor %r8, (%r10) + sub $1, %rax + jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX2_Permute_12rounds +.else + call KeccakP1600_AVX2_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes + jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb +.endif + +.equ ALLON, 0xFFFFFFFFFFFFFFFF + +.balign 64 +rhotates_left: + .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] + .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] + .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] + .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] + .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] + .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] +rhotates_right: + .quad 64-3, 64-18, 64-36, 64-41 + .quad 64-1, 64-62, 64-28, 64-27 + .quad 64-45, 64-6, 64-56, 64-39 + .quad 64-10, 64-61, 64-55, 64-8 + .quad 64-2, 64-15, 64-25, 64-20 + .quad 64-44, 64-43, 64-21, 64-14 +iotas: + .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 
0x0000000000000001 + .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 + .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a + .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 + .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 + .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 + .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 + .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a + .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b + .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b + .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 + .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 + .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a + .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 + +mapState: + .quad 0*8, 1*8, 2*8, 3*8, 4*8 + .quad 7*8, 21*8, 10*8, 15*8, 20*8 + .quad 5*8, 13*8, 22*8, 19*8, 12*8 + .quad 8*8, 9*8, 18*8, 23*8, 16*8 + .quad 6*8, 17*8, 14*8, 11*8, 24*8 + + .balign 16 +map2: + .long 10*8, 20*8, 5*8, 15*8 +map3: + .long 16*8, 7*8, 23*8, 14*8 +map4: + .long 11*8, 22*8, 8*8, 19*8 +map5: + .long 21*8, 17*8, 13*8, 9*8 +map6: + .long 6*8, 12*8, 18*8, 24*8 + + .balign 32 +mask3_21: + .quad ALLON, ALLON, 0, ALLON +mask4_21: + .quad ALLON, 0, ALLON, ALLON +mask5_21: + .quad 0, ALLON, ALLON, ALLON +mask6_21: + .quad ALLON, ALLON, ALLON, 0 + +mask2_17: + .quad ALLON, 0, ALLON, ALLON +mask3_17: + .quad ALLON, ALLON, 0, ALLON +mask4_17: + .quad ALLON, 0, ALLON, 0 +mask5_17: + .quad 0, 0, ALLON, ALLON +mask6_17: + .quad ALLON, ALLON, 0, 0 + +.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c new file mode 100644 index 0000000..b426421 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c @@ -0,0 +1,241 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Ronny Van Keer, hereby denoted as "the implementer". 
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+We would like to thank Vladimir Sedach; we have used parts of his Keccak AVX-512 C++ code.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <immintrin.h>
+#include "align.h"
+
+typedef __m512i V512;
+
+#define XOR(a,b) _mm512_xor_si512(a,b)
+#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96)
+#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e)
+#define ROL(a,offset) _mm512_rol_epi64(a,offset)
+#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2)
+
+#define LOAD_Lanes(m,a) _mm512_maskz_loadu_epi64(m,a)
+#define LOAD_Lane(a) LOAD_Lanes(0x01,a)
+#define LOAD_Plane(a) LOAD_Lanes(0x1F,a)
+#define LOAD_8Lanes(a) LOAD_Lanes(0xFF,a)
+#define STORE_Lanes(a,m,v) _mm512_mask_storeu_epi64(a,m,v)
+#define STORE_Lane(a,v) STORE_Lanes(a,0x01,v)
+#define STORE_Plane(a,v) STORE_Lanes(a,0x1F,v)
+#define STORE_8Lanes(a,v) STORE_Lanes(a,0xFF,v)
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AVX512_Initialize(void *state)
+{
+    memset(state, 0, 1600/8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    uint8_t *stateAsBytes;
+    uint64_t *stateAsLanes;
+
+    for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length)
+        stateAsBytes[offset] ^= *(data++);
+    for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 8*8; stateAsLanes += 8, data += 8*8, length -= 8*8)
+        STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data)));
+    for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8)
+        STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data)));
+    for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length)
+        *(stateAsBytes++) ^= *(data++);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    memcpy(data, (unsigned char*)state+offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+const uint64_t KeccakP1600RoundConstants[24] = {
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+#define KeccakP_DeclareVars \
+    V512 b0, b1, b2, b3, b4; \
+    V512 Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; \
+    V512 moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); \
+    V512 moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); \
+    V512 rhoB = _mm512_setr_epi64( 0, 1, 62, 28, 27, 0, 0, 0); \
+    V512 rhoG = _mm512_setr_epi64(36, 44, 6,
55, 20, 0, 0, 0); \ + V512 rhoK = _mm512_setr_epi64( 3, 10, 43, 25, 39, 0, 0, 0); \ + V512 rhoM = _mm512_setr_epi64(41, 45, 15, 21, 8, 0, 0, 0); \ + V512 rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); \ + V512 pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); \ + V512 pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \ + V512 pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \ + V512 pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \ + V512 pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \ + V512 pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); \ + V512 pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \ + V512 pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \ + V512 pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \ + V512 pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7); + +#define copyFromState(pState) \ + Baeiou = LOAD_Plane(pState+ 0); \ + Gaeiou = LOAD_Plane(pState+ 5); \ + Kaeiou = LOAD_Plane(pState+10); \ + Maeiou = LOAD_Plane(pState+15); \ + Saeiou = LOAD_Plane(pState+20); + +#define copyToState(pState) \ + STORE_Plane(pState+ 0, Baeiou); \ + STORE_Plane(pState+ 5, Gaeiou); \ + STORE_Plane(pState+10, Kaeiou); \ + STORE_Plane(pState+15, Maeiou); \ + STORE_Plane(pState+20, Saeiou); + +#define KeccakP_Round(i) \ + /* Theta */ \ + b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); \ + b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \ + b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \ + b0 = _mm512_rol_epi64(b0, 1); \ + Baeiou = XOR3( Baeiou, b0, b1 ); \ + Gaeiou = XOR3( Gaeiou, b0, b1 ); \ + Kaeiou = XOR3( Kaeiou, b0, b1 ); \ + Maeiou = XOR3( Maeiou, b0, b1 ); \ + Saeiou = XOR3( Saeiou, b0, b1 ); \ + /* Rho */ \ + Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \ + Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \ + Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \ + Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \ + Saeiou = _mm512_rolv_epi64(Saeiou, rhoS); \ + /* Pi 1 */ \ + b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \ + b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \ + b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \ + b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \ + b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \ + /* Chi */ \ + Baeiou = Chi(b0, b1, b2); \ + Gaeiou = Chi(b1, b2, b3); \ + Kaeiou = Chi(b2, b3, b4); \ + Maeiou = Chi(b3, b4, b0); \ + Saeiou = Chi(b4, b0, b1); \ + /* Iota */ \ + Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); \ + /* Pi 2 */ \ + b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \ + b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \ + b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \ + b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \ + b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \ + Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \ + Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); \ + Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \ + Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \ + b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \ + Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou) + +#define rounds12 \ + KeccakP_Round( 12 ); \ + KeccakP_Round( 13 ); \ + KeccakP_Round( 14 ); \ + KeccakP_Round( 15 ); \ + KeccakP_Round( 16 ); \ + KeccakP_Round( 17 ); \ + KeccakP_Round( 18 ); \ + KeccakP_Round( 19 ); \ + KeccakP_Round( 20 ); \ + KeccakP_Round( 21 ); \ + KeccakP_Round( 22 ); \ + KeccakP_Round( 23 ) + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_AVX512_Permute_12rounds(void *state) +{ + KeccakP_DeclareVars + uint64_t 
*stateAsLanes = (uint64_t*)state;
+
+    copyFromState(stateAsLanes);
+    rounds12;
+    copyToState(stateAsLanes);
+}
+
+/* ---------------------------------------------------------------- */
+
+#include <assert.h>
+
+size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
+{
+    size_t originalDataByteLen = dataByteLen;
+
+    assert(laneCount == 21);
+
+    KeccakP_DeclareVars;
+    uint64_t *stateAsLanes = (uint64_t*)state;
+    uint64_t *inDataAsLanes = (uint64_t*)data;
+
+    copyFromState(stateAsLanes);
+    while(dataByteLen >= 21*8) {
+        Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0));
+        Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5));
+        Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10));
+        Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15));
+        Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20));
+        rounds12;
+        inDataAsLanes += 21;
+        dataByteLen -= 21*8;
+    }
+    copyToState(stateAsLanes);
+
+    return originalDataByteLen - dataByteLen;
+}
diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s
new file mode 100644
index 0000000..383ca43
--- /dev/null
+++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s
@@ -0,0 +1,551 @@
+# Copyright (c) 2006-2017, CRYPTOGAMS by
+# Copyright (c) 2018 Ronny Van Keer
+# All rights reserved.
+#
+# The source code in this file is licensed under the CRYPTOGAMS license.
+# For further details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Notes:
+# The code for the permutation (__KeccakF1600) was generated with
+# Andy Polyakov's keccak1600-avx512.pl from the CRYPTOGAMS project
+# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx512.pl).
+# The rest of the code was written by Ronny Van Keer.
+# Adaptations for macOS by Stéphane Léon.
+
+.text
+
+# -----------------------------------------------------------------------------
+#
+# void KeccakP1600_AVX512_Initialize(void *state);
+#
+.ifdef macOS
+.globl _KeccakP1600_AVX512_Initialize
+_KeccakP1600_AVX512_Initialize:
+.else
+.globl KeccakP1600_AVX512_Initialize
+.type KeccakP1600_AVX512_Initialize,@function
+KeccakP1600_AVX512_Initialize:
+.endif
+.balign 32
+    vpxorq %zmm0,%zmm0,%zmm0
+    vmovdqu64 %zmm0,0*64(%rdi)
+    vmovdqu64 %zmm0,1*64(%rdi)
+    vmovdqu64 %zmm0,2*64(%rdi)
+    movq $0,3*64(%rdi)
+    ret
+.ifdef macOS
+.else
+.size KeccakP1600_AVX512_Initialize,.-KeccakP1600_AVX512_Initialize
+.endif
+
+# -----------------------------------------------------------------------------
+#
+# void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset);
+#                                       %rdi                 %rsi               %rdx
+#!!
+#.globl KeccakP1600_AVX512_AddByte +#.type KeccakP1600_AVX512_AddByte,@function +#.balign 32 +#KeccakP1600_AVX512_AddByte: +# mov %rdx, %rax +# and $7, %rax +# and $0xFFFFFFF8, %edx +# mov mapState(%rdx), %rdx +# add %rdx, %rdi +# add %rax, %rdi +# xorb %sil, (%rdi) +# ret +#.size KeccakP1600_AVX512_AddByte,.-KeccakP1600_AVX512_AddByte + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_AddBytes +_KeccakP1600_AVX512_AddBytes: +.else +.globl KeccakP1600_AVX512_AddBytes +.type KeccakP1600_AVX512_AddBytes,@function +KeccakP1600_AVX512_AddBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_AddBytes_LaneAlignedCheck + mov $8, %r9 # r9 is (max) length of incomplete lane + sub %rdx, %r9 + cmp %rcx, %r9 + cmovae %rcx, %r9 + sub %r9, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_AddBytes_NotAlignedLoop: + mov (%rsi), %r8b + inc %rsi + xorb %r8b, (%rdi) + inc %rdi + dec %r9 + jnz KeccakP1600_AVX512_AddBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_AddBytes_LaneAlignedCheck +KeccakP1600_AVX512_AddBytes_LaneAlignedLoop: + mov (%rsi), %r8 + add $8, %rsi + xor %r8, (%rdi) + add $8, %rdi +KeccakP1600_AVX512_AddBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_AddBytes_LaneAlignedLoop +KeccakP1600_AVX512_AddBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_AddBytes_Exit +KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop: + mov (%rsi), %r8b + inc %rsi + xor %r8b, (%rdi) + inc %rdi + dec %rcx + jnz KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop +KeccakP1600_AVX512_AddBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_AddBytes,.-KeccakP1600_AVX512_AddBytes +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_ExtractBytes +_KeccakP1600_AVX512_ExtractBytes: +.else +.globl KeccakP1600_AVX512_ExtractBytes +.type KeccakP1600_AVX512_ExtractBytes,@function +KeccakP1600_AVX512_ExtractBytes: +.endif +.balign 32 + cmp $0, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + add %rdx, %rdi # state += offset + and $7, %rdx + jz KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck + mov $8, %rax # rax is (max) length of incomplete lane + sub %rdx, %rax + cmp %rcx, %rax + cmovae %rcx, %rax + sub %rax, %rcx # length -= length of incomplete lane +KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop: + mov (%rdi), %r8b + inc %rdi + mov %r8b, (%rsi) + inc %rsi + dec %rax + jnz KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop + jmp KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck +KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop: + mov (%rdi), %r8 + add $8, %rdi + mov %r8, (%rsi) + add $8, %rsi +KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck: + sub $8, %rcx + jnc KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLane: + add $8, %rcx + jz KeccakP1600_AVX512_ExtractBytes_Exit + mov (%rdi), %r8 +KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop: + mov %r8b, (%rsi) + shr $8, %r8 + inc %rsi + dec %rcx + jnz KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop 
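+        # Note: the loop above emits the trailing 1..7 bytes of the final lane one byte
+        # at a time, lowest-addressed byte first (the lane was loaded into %r8 and is
+        # shifted right by 8 bits after each store).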
+KeccakP1600_AVX512_ExtractBytes_Exit: + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_ExtractBytes,.-KeccakP1600_AVX512_ExtractBytes +.endif + +# ----------------------------------------------------------------------------- +# +# internal +# +.text +.ifdef macOS +.else +.type __KeccakF1600,@function +.endif +.balign 32 +__KeccakF1600: +.Loop_avx512: + ######################################### Theta, even round + vmovdqa64 %zmm0,%zmm5 # put aside original A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm0 # and use it as "C00" + vpternlogq $0x96,%zmm4,%zmm3,%zmm0 + vprolq $1,%zmm0,%zmm6 + vpermq %zmm0,%zmm13,%zmm0 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm0,%zmm6,%zmm5 # T[0] is original A00 + vpternlogq $0x96,%zmm0,%zmm6,%zmm1 + vpternlogq $0x96,%zmm0,%zmm6,%zmm2 + vpternlogq $0x96,%zmm0,%zmm6,%zmm3 + vpternlogq $0x96,%zmm0,%zmm6,%zmm4 + ######################################### Rho + vprolvq %zmm22,%zmm5,%zmm0 # T[0] is original A00 + vprolvq %zmm23,%zmm1,%zmm1 + vprolvq %zmm24,%zmm2,%zmm2 + vprolvq %zmm25,%zmm3,%zmm3 + vprolvq %zmm26,%zmm4,%zmm4 + ######################################### Pi + vpermq %zmm0,%zmm17,%zmm0 + vpermq %zmm1,%zmm18,%zmm1 + vpermq %zmm2,%zmm19,%zmm2 + vpermq %zmm3,%zmm20,%zmm3 + vpermq %zmm4,%zmm21,%zmm4 + ######################################### Chi + vmovdqa64 %zmm0,%zmm5 + vmovdqa64 %zmm1,%zmm6 + vpternlogq $0xD2,%zmm2,%zmm1,%zmm0 + vpternlogq $0xD2,%zmm3,%zmm2,%zmm1 + vpternlogq $0xD2,%zmm4,%zmm3,%zmm2 + vpternlogq $0xD2,%zmm5,%zmm4,%zmm3 + vpternlogq $0xD2,%zmm6,%zmm5,%zmm4 + ######################################### Iota + vpxorq (%r10),%zmm0,%zmm0{%k1} + lea 16(%r10),%r10 + ######################################### Harmonize rounds + vpblendmq %zmm2,%zmm1,%zmm6{%k2} + vpblendmq %zmm3,%zmm2,%zmm7{%k2} + vpblendmq %zmm4,%zmm3,%zmm8{%k2} + vpblendmq %zmm1,%zmm0,%zmm5{%k2} + vpblendmq %zmm0,%zmm4,%zmm9{%k2} + vpblendmq %zmm3,%zmm6,%zmm6{%k3} + vpblendmq %zmm4,%zmm7,%zmm7{%k3} + vpblendmq %zmm2,%zmm5,%zmm5{%k3} + vpblendmq %zmm0,%zmm8,%zmm8{%k3} + vpblendmq %zmm1,%zmm9,%zmm9{%k3} + vpblendmq %zmm4,%zmm6,%zmm6{%k4} + vpblendmq %zmm3,%zmm5,%zmm5{%k4} + vpblendmq %zmm0,%zmm7,%zmm7{%k4} + vpblendmq %zmm1,%zmm8,%zmm8{%k4} + vpblendmq %zmm2,%zmm9,%zmm9{%k4} + vpblendmq %zmm4,%zmm5,%zmm5{%k5} + vpblendmq %zmm0,%zmm6,%zmm6{%k5} + vpblendmq %zmm1,%zmm7,%zmm7{%k5} + vpblendmq %zmm2,%zmm8,%zmm8{%k5} + vpblendmq %zmm3,%zmm9,%zmm9{%k5} + #vpermq %zmm5,%zmm33,%zmm0 # doesn't actually change order + vpermq %zmm6,%zmm13,%zmm1 + vpermq %zmm7,%zmm14,%zmm2 + vpermq %zmm8,%zmm15,%zmm3 + vpermq %zmm9,%zmm16,%zmm4 + ######################################### Theta, odd round + vmovdqa64 %zmm5,%zmm0 # real A00 + vpternlogq $0x96,%zmm2,%zmm1,%zmm5 # C00 is %zmm5's alias + vpternlogq $0x96,%zmm4,%zmm3,%zmm5 + vprolq $1,%zmm5,%zmm6 + vpermq %zmm5,%zmm13,%zmm5 + vpermq %zmm6,%zmm16,%zmm6 + vpternlogq $0x96,%zmm5,%zmm6,%zmm0 + vpternlogq $0x96,%zmm5,%zmm6,%zmm3 + vpternlogq $0x96,%zmm5,%zmm6,%zmm1 + vpternlogq $0x96,%zmm5,%zmm6,%zmm4 + vpternlogq $0x96,%zmm5,%zmm6,%zmm2 + ######################################### Rho + vprolvq %zmm27,%zmm0,%zmm0 + vprolvq %zmm30,%zmm3,%zmm6 + vprolvq %zmm28,%zmm1,%zmm7 + vprolvq %zmm31,%zmm4,%zmm8 + vprolvq %zmm29,%zmm2,%zmm9 + vpermq %zmm0,%zmm16,%zmm10 + vpermq %zmm0,%zmm15,%zmm11 + ######################################### Iota + vpxorq -8(%r10),%zmm0,%zmm0{%k1} + ######################################### Pi + vpermq %zmm6,%zmm14,%zmm1 + vpermq %zmm7,%zmm16,%zmm2 + vpermq %zmm8,%zmm13,%zmm3 + vpermq %zmm9,%zmm15,%zmm4 + 
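+        # Each pass of .Loop_avx512 covers two rounds (the "even" and "odd" halves), which
+        # is why callers load %eax with 12/2. vpternlogq with immediate 0xD2 evaluates
+        # a ^ (~b & c), the Chi step (the same truth table as the Chi() macro in
+        # KeccakP-1600-AVX512-plainC.c); the vpermq below rearrange its second and third
+        # operands in place of a separate full Pi pass.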
######################################### Chi + vpternlogq $0xD2,%zmm11,%zmm10,%zmm0 + vpermq %zmm6,%zmm13,%zmm12 + #vpermq %zmm6,%zmm33,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm12,%zmm1 + vpermq %zmm7,%zmm15,%zmm5 + vpermq %zmm7,%zmm14,%zmm7 + vpternlogq $0xD2,%zmm7,%zmm5,%zmm2 + #vpermq %zmm8,%zmm33,%zmm8 + vpermq %zmm8,%zmm16,%zmm6 + vpternlogq $0xD2,%zmm6,%zmm8,%zmm3 + vpermq %zmm9,%zmm14,%zmm5 + vpermq %zmm9,%zmm13,%zmm9 + vpternlogq $0xD2,%zmm9,%zmm5,%zmm4 + dec %eax + jnz .Loop_avx512 + ret +.ifdef macOS +.else +.size __KeccakF1600,.-__KeccakF1600 +.endif + +# ----------------------------------------------------------------------------- +# +# void KeccakP1600_AVX512_Permute_12rounds(void *state); +# %rdi +# +.ifdef macOS +.globl _KeccakP1600_AVX512_Permute_12rounds +_KeccakP1600_AVX512_Permute_12rounds: +.else +.globl KeccakP1600_AVX512_Permute_12rounds +.type KeccakP1600_AVX512_Permute_12rounds,@function +KeccakP1600_AVX512_Permute_12rounds: +.endif +.balign 32 + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + #vmovdqa64 64*0(%r8),%zmm33 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} +# vpxorq %zmm5,%zmm5,%zmm5 + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 %zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} + vzeroupper + ret +.ifdef macOS +.else +.size KeccakP1600_AVX512_Permute_12rounds,.-KeccakP1600_AVX512_Permute_12rounds +.endif + +# ----------------------------------------------------------------------------- +# +# size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); +# %rdi %rsi %rdx %rcx +# +.ifdef macOS +.globl _KeccakP1600_AVX512_12rounds_FastLoop_Absorb +_KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.else +.globl KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.type KeccakP1600_AVX512_12rounds_FastLoop_Absorb,@function +KeccakP1600_AVX512_12rounds_FastLoop_Absorb: +.endif +.balign 32 + push %rbx + push %r10 + shr $3, %rcx # rcx = data length in lanes + mov %rdx, %rbx # rbx = initial data pointer + cmp %rsi, %rcx + jb KeccakP1600_AVX512_FastLoop_Absorb_Exit + lea 96(%rdi),%rdi + lea theta_perm(%rip),%r8 + kxnorw %k6,%k6,%k6 + kshiftrw $15,%k6,%k1 + kshiftrw $11,%k6,%k6 + kshiftlw $1,%k1,%k2 + kshiftlw $2,%k1,%k3 + kshiftlw $3,%k1,%k4 + kshiftlw $4,%k1,%k5 + vmovdqa64 64*1(%r8),%zmm13 + vmovdqa64 64*2(%r8),%zmm14 + vmovdqa64 64*3(%r8),%zmm15 + vmovdqa64 64*4(%r8),%zmm16 + vmovdqa64 64*5(%r8),%zmm27 + vmovdqa64 64*6(%r8),%zmm28 + vmovdqa64 64*7(%r8),%zmm29 + vmovdqa64 64*8(%r8),%zmm30 + vmovdqa64 
64*9(%r8),%zmm31 + vmovdqa64 64*10(%r8),%zmm22 + vmovdqa64 64*11(%r8),%zmm23 + vmovdqa64 64*12(%r8),%zmm24 + vmovdqa64 64*13(%r8),%zmm25 + vmovdqa64 64*14(%r8),%zmm26 + vmovdqa64 64*15(%r8),%zmm17 + vmovdqa64 64*16(%r8),%zmm18 + vmovdqa64 64*17(%r8),%zmm19 + vmovdqa64 64*18(%r8),%zmm20 + vmovdqa64 64*19(%r8),%zmm21 + vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} + vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} + vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} + vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} + vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} + cmp $21, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes + sub $21, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k6}{z} + vmovdqu64 8*20(%rdx),%zmm9{%k1}{z} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + vpxorq %zmm9,%zmm4,%zmm4 + add $21*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $21, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes +KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit: + vmovdqu64 %zmm0,40*0-96(%rdi){%k6} + vmovdqu64 %zmm1,40*1-96(%rdi){%k6} + vmovdqu64 %zmm2,40*2-96(%rdi){%k6} + vmovdqu64 %zmm3,40*3-96(%rdi){%k6} + vmovdqu64 %zmm4,40*4-96(%rdi){%k6} +KeccakP1600_AVX512_FastLoop_Absorb_Exit: + vzeroupper + mov %rdx, %rax # return number of bytes processed + sub %rbx, %rax + pop %r10 + pop %rbx + ret +KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes: + cmp $17, %rsi + jnz KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes + sub $17, %rcx +KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes: + vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} + vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} + vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k1}{z} + vmovdqu64 8*15(%rdx),%zmm8{%k2} + vpxorq %zmm5,%zmm0,%zmm0 + vpxorq %zmm6,%zmm1,%zmm1 + vpxorq %zmm7,%zmm2,%zmm2 + vpxorq %zmm8,%zmm3,%zmm3 + add $17*8, %rdx + lea iotas+12*8(%rip), %r10 + mov $12/2, %eax + call __KeccakF1600 + sub $17, %rcx + jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes + jmp KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit +KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes: + lea -96(%rdi), %rdi +KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop: + mov %rsi, %rax + mov %rdi, %r10 +KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop: + mov (%rdx), %r8 + add $8, %rdx + xor %r8, (%r10) + add $8, %r10 + sub $1, %rax + jnz KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop + sub %rsi, %rcx + push %rdi + push %rsi + push %rdx + push %rcx +.ifdef macOS + call _KeccakP1600_AVX512_Permute_12rounds +.else + call KeccakP1600_AVX512_Permute_12rounds@PLT +.endif + pop %rcx + pop %rdx + pop %rsi + pop %rdi + cmp %rsi, %rcx + jae KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop + jmp KeccakP1600_AVX512_FastLoop_Absorb_Exit +.ifdef macOS +.else +.size KeccakP1600_AVX512_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX512_12rounds_FastLoop_Absorb +.endif +.balign 64 +theta_perm: + .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] + .quad 4, 0, 1, 2, 3, 5, 6, 7 + .quad 3, 4, 0, 1, 2, 5, 6, 7 + .quad 2, 3, 4, 0, 1, 5, 6, 7 + .quad 1, 2, 3, 4, 0, 5, 6, 7 +rhotates1: + .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] + .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] + .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] + .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] + .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] 
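+# rhotates1 above is loaded into %zmm27-%zmm31 and feeds the odd-round vprolvq;
+# rhotates0 below is loaded into %zmm22-%zmm26 for the even rounds and matches the
+# rhoB..rhoS rotation constants of KeccakP-1600-AVX512-plainC.c.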
+rhotates0: + .quad 0, 1, 62, 28, 27, 0, 0, 0 + .quad 36, 44, 6, 55, 20, 0, 0, 0 + .quad 3, 10, 43, 25, 39, 0, 0, 0 + .quad 41, 45, 15, 21, 8, 0, 0, 0 + .quad 18, 2, 61, 56, 14, 0, 0, 0 +pi0_perm: + .quad 0, 3, 1, 4, 2, 5, 6, 7 + .quad 1, 4, 2, 0, 3, 5, 6, 7 + .quad 2, 0, 3, 1, 4, 5, 6, 7 + .quad 3, 1, 4, 2, 0, 5, 6, 7 + .quad 4, 2, 0, 3, 1, 5, 6, 7 +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +iotas_end: +.asciz "Keccak-1600 for AVX-512F, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h new file mode 100644 index 0000000..709469c --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h @@ -0,0 +1,74 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. 
+*/ + +#ifndef _KeccakP_1600_SnP_h_ +#define _KeccakP_1600_SnP_h_ + +/* Keccak-p[1600] */ + +#define KeccakP1600_stateSizeInBytes 200 +#define KeccakP1600_stateAlignment 8 +#define KeccakP1600_12rounds_FastLoop_supported + +const char * KeccakP1600_GetImplementation(); +void KeccakP1600_Initialize(void *state); +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_Permute_12rounds(void *state); +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX512_Initialize(void *state); +void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_AVX512_Permute_12rounds(void *state); +void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_AVX2_Initialize(void *state); +void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_AVX2_Permute_12rounds(void *state); +void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +void KeccakP1600_opt64_Initialize(void *state); +void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); +void KeccakP1600_opt64_Permute_12rounds(void *state); +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); + +/* Keccak-p[1600]×2 */ + +int KeccakP1600times2_IsAvailable(); +const char * KeccakP1600times2_GetImplementation(); + +/* Keccak-p[1600]×4 */ + +int KeccakP1600times4_IsAvailable(); +const char * KeccakP1600times4_GetImplementation(); + +/* Keccak-p[1600]×8 */ + +int KeccakP1600times8_IsAvailable(); +const char * KeccakP1600times8_GetImplementation(); + +#endif diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c new file mode 100644 index 0000000..e98056d --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c @@ -0,0 +1,1026 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include +#include +#include +#include "brg_endian.h" +#include + +#define KeccakP1600_opt64_implementation_config "all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, 6 rounds unrolled" +#define KeccakP1600_opt64_unrolling 6 +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +*/ +/* Or */ +/* +#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled, using SHLD for rotations" +#define KeccakP1600_opt64_fullUnrolling +#define KeccakP1600_opt64_useLaneComplementing +#define KeccakP1600_opt64_useSHLD +*/ + +#if defined(KeccakP1600_opt64_useLaneComplementing) +#define UseBebigokimisa +#endif + +#if defined(_MSC_VER) +#define ROL64(a, offset) _rotl64(a, offset) +#elif defined(KeccakP1600_opt64_useSHLD) + #define ROL64(x,N) ({ \ + register uint64_t __out; \ + register uint64_t __in = x; \ + __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ + __out; \ + }) +#else +#define ROL64(a, offset) ((((uint64_t)a) << offset) ^ (((uint64_t)a) >> (64-offset))) +#endif + +#ifdef KeccakP1600_opt64_fullUnrolling +#define FullUnrolling +#else +#define Unrolling KeccakP1600_opt64_unrolling +#endif + +static const uint64_t KeccakF1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL }; + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_Initialize(void *state) +{ + memset(state, 0, 200); +#ifdef KeccakP1600_opt64_useLaneComplementing + ((uint64_t*)state)[ 1] = ~(uint64_t)0; + ((uint64_t*)state)[ 2] = ~(uint64_t)0; + ((uint64_t*)state)[ 8] = ~(uint64_t)0; + ((uint64_t*)state)[12] = ~(uint64_t)0; + ((uint64_t*)state)[17] = ~(uint64_t)0; + ((uint64_t*)state)[20] = ~(uint64_t)0; +#endif +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + uint64_t lane; + if (length == 0) + return; + if (length == 1) + lane = data[0]; + else { + lane = 0; + memcpy(&lane, data, length); + } + lane <<= offset*8; +#else + uint64_t lane = 0; + unsigned int i; + for(i=0; i 0) { \ + unsigned int 
_bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#define declareABCDE \ + uint64_t Aba, Abe, Abi, Abo, Abu; \ + uint64_t Aga, Age, Agi, Ago, Agu; \ + uint64_t Aka, Ake, Aki, Ako, Aku; \ + uint64_t Ama, Ame, Ami, Amo, Amu; \ + uint64_t Asa, Ase, Asi, Aso, Asu; \ + uint64_t Bba, Bbe, Bbi, Bbo, Bbu; \ + uint64_t Bga, Bge, Bgi, Bgo, Bgu; \ + uint64_t Bka, Bke, Bki, Bko, Bku; \ + uint64_t Bma, Bme, Bmi, Bmo, Bmu; \ + uint64_t Bsa, Bse, Bsi, Bso, Bsu; \ + uint64_t Ca, Ce, Ci, Co, Cu; \ + uint64_t Da, De, Di, Do, Du; \ + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; \ + uint64_t Ega, Ege, Egi, Ego, Egu; \ + uint64_t Eka, Eke, Eki, Eko, Eku; \ + uint64_t Ema, Eme, Emi, Emo, Emu; \ + uint64_t Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = Aba^Aga^Aka^Ama^Asa; \ + Ce = Abe^Age^Ake^Ame^Ase; \ + Ci = Abi^Agi^Aki^Ami^Asi; \ + Co = Abo^Ago^Ako^Amo^Aso; \ + Cu = Abu^Agu^Aku^Amu^Asu; \ + +#ifdef UseBebigokimisa +/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^( Bbo & Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^( Bbu | Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^( Bba & Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^( Bgi & Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + Ci ^= E##gi; \ + E##go = Bgo ^( Bgu | Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^( Bga & Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^( Bki & Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = (~Bko)^( Bku | Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^( Bka & Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^( 
Bmi | Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + Ci ^= E##mi; \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^( Bma | Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = (~Bse)^( Bsi | Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^( Bso & Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^( Bsu | Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^( Bsa & Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round (lane complementing pattern 'bebigokimisa') */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^( Bbe | Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)| Bbo ); \ + E##bi = Bbi ^( Bbo & Bbu ); \ + E##bo = Bbo ^( Bbu | Bba ); \ + E##bu = Bbu ^( Bba & Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^( Bge | Bgi ); \ + E##ge = Bge ^( Bgi & Bgo ); \ + E##gi = Bgi ^( Bgo |(~Bgu)); \ + E##go = Bgo ^( Bgu | Bga ); \ + E##gu = Bgu ^( Bga & Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^( Bke | Bki ); \ + E##ke = Bke ^( Bki & Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = (~Bko)^( Bku | Bka ); \ + E##ku = Bku ^( Bka & Bke ); \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^( Bme & Bmi ); \ + E##me = Bme ^( Bmi | Bmo ); \ + E##mi = Bmi ^((~Bmo)| Bmu ); \ + E##mo = (~Bmo)^( Bmu & Bma ); \ + E##mu = Bmu ^( Bma | Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = (~Bse)^( Bsi | Bso ); \ + E##si = Bsi ^( Bso & Bsu ); \ + E##so = Bso ^( Bsu | Bsa ); \ + E##su = Bsu ^( Bsa & Bse ); \ +\ + +#else /* UseBebigokimisa */ +/* --- Code for round, with prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + Ca = E##ba; \ + E##be = Bbe 
^((~Bbi)& Bbo ); \ + Ce = E##be; \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + Ci = E##bi; \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + Co = E##bo; \ + E##bu = Bbu ^((~Bba)& Bbe ); \ + Cu = E##bu; \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + Ca ^= E##ga; \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + Ce ^= E##ge; \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + Ci ^= E##gi; \ + E##go = Bgo ^((~Bgu)& Bga ); \ + Co ^= E##go; \ + E##gu = Bgu ^((~Bga)& Bge ); \ + Cu ^= E##gu; \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + Ca ^= E##ka; \ + E##ke = Bke ^((~Bki)& Bko ); \ + Ce ^= E##ke; \ + E##ki = Bki ^((~Bko)& Bku ); \ + Ci ^= E##ki; \ + E##ko = Bko ^((~Bku)& Bka ); \ + Co ^= E##ko; \ + E##ku = Bku ^((~Bka)& Bke ); \ + Cu ^= E##ku; \ +\ + A##bu ^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + Ca ^= E##ma; \ + E##me = Bme ^((~Bmi)& Bmo ); \ + Ce ^= E##me; \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + Ci ^= E##mi; \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + Co ^= E##mo; \ + E##mu = Bmu ^((~Bma)& Bme ); \ + Cu ^= E##mu; \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + Ca ^= E##sa; \ + E##se = Bse ^((~Bsi)& Bso ); \ + Ce ^= E##se; \ + E##si = Bsi ^((~Bso)& Bsu ); \ + Ci ^= E##si; \ + E##so = Bso ^((~Bsu)& Bsa ); \ + Co ^= E##so; \ + E##su = Bsu ^((~Bsa)& Bse ); \ + Cu ^= E##su; \ +\ + +/* --- Code for round */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = Cu^ROL64(Ce, 1); \ + De = Ca^ROL64(Ci, 1); \ + Di = Ce^ROL64(Co, 1); \ + Do = Ci^ROL64(Cu, 1); \ + Du = Co^ROL64(Ca, 1); \ +\ + A##ba ^= Da; \ + Bba = A##ba; \ + A##ge ^= De; \ + Bbe = ROL64(A##ge, 44); \ + A##ki ^= Di; \ + Bbi = ROL64(A##ki, 43); \ + A##mo ^= Do; \ + Bbo = ROL64(A##mo, 21); \ + A##su ^= Du; \ + Bbu = ROL64(A##su, 14); \ + E##ba = Bba ^((~Bbe)& Bbi ); \ + E##ba ^= KeccakF1600RoundConstants[i]; \ + E##be = Bbe ^((~Bbi)& Bbo ); \ + E##bi = Bbi ^((~Bbo)& Bbu ); \ + E##bo = Bbo ^((~Bbu)& Bba ); \ + E##bu = Bbu ^((~Bba)& Bbe ); \ +\ + A##bo ^= Do; \ + Bga = ROL64(A##bo, 28); \ + A##gu ^= Du; \ + Bge = ROL64(A##gu, 20); \ + A##ka ^= Da; \ + Bgi = ROL64(A##ka, 3); \ + A##me ^= De; \ + Bgo = ROL64(A##me, 45); \ + A##si ^= Di; \ + Bgu = ROL64(A##si, 61); \ + E##ga = Bga ^((~Bge)& Bgi ); \ + E##ge = Bge ^((~Bgi)& Bgo ); \ + E##gi = Bgi ^((~Bgo)& Bgu ); \ + E##go = Bgo ^((~Bgu)& Bga ); \ + E##gu = Bgu ^((~Bga)& Bge ); \ +\ + A##be ^= De; \ + Bka = ROL64(A##be, 1); \ + A##gi ^= Di; \ + Bke = ROL64(A##gi, 6); \ + A##ko ^= Do; \ + Bki = ROL64(A##ko, 25); \ + A##mu ^= Du; \ + Bko = ROL64(A##mu, 8); \ + A##sa ^= Da; \ + Bku = ROL64(A##sa, 18); \ + E##ka = Bka ^((~Bke)& Bki ); \ + E##ke = Bke ^((~Bki)& Bko ); \ + E##ki = Bki ^((~Bko)& Bku ); \ + E##ko = Bko ^((~Bku)& Bka ); \ + E##ku = Bku ^((~Bka)& Bke ); \ +\ + A##bu 
^= Du; \ + Bma = ROL64(A##bu, 27); \ + A##ga ^= Da; \ + Bme = ROL64(A##ga, 36); \ + A##ke ^= De; \ + Bmi = ROL64(A##ke, 10); \ + A##mi ^= Di; \ + Bmo = ROL64(A##mi, 15); \ + A##so ^= Do; \ + Bmu = ROL64(A##so, 56); \ + E##ma = Bma ^((~Bme)& Bmi ); \ + E##me = Bme ^((~Bmi)& Bmo ); \ + E##mi = Bmi ^((~Bmo)& Bmu ); \ + E##mo = Bmo ^((~Bmu)& Bma ); \ + E##mu = Bmu ^((~Bma)& Bme ); \ +\ + A##bi ^= Di; \ + Bsa = ROL64(A##bi, 62); \ + A##go ^= Do; \ + Bse = ROL64(A##go, 55); \ + A##ku ^= Du; \ + Bsi = ROL64(A##ku, 39); \ + A##ma ^= Da; \ + Bso = ROL64(A##ma, 41); \ + A##se ^= De; \ + Bsu = ROL64(A##se, 2); \ + E##sa = Bsa ^((~Bse)& Bsi ); \ + E##se = Bse ^((~Bsi)& Bso ); \ + E##si = Bsi ^((~Bso)& Bsu ); \ + E##so = Bso ^((~Bsu)& Bsa ); \ + E##su = Bsu ^((~Bsa)& Bse ); \ +\ + +#endif /* UseBebigokimisa */ + +#define copyFromState(X, state) \ + X##ba = state[ 0]; \ + X##be = state[ 1]; \ + X##bi = state[ 2]; \ + X##bo = state[ 3]; \ + X##bu = state[ 4]; \ + X##ga = state[ 5]; \ + X##ge = state[ 6]; \ + X##gi = state[ 7]; \ + X##go = state[ 8]; \ + X##gu = state[ 9]; \ + X##ka = state[10]; \ + X##ke = state[11]; \ + X##ki = state[12]; \ + X##ko = state[13]; \ + X##ku = state[14]; \ + X##ma = state[15]; \ + X##me = state[16]; \ + X##mi = state[17]; \ + X##mo = state[18]; \ + X##mu = state[19]; \ + X##sa = state[20]; \ + X##se = state[21]; \ + X##si = state[22]; \ + X##so = state[23]; \ + X##su = state[24]; \ + +#define copyToState(state, X) \ + state[ 0] = X##ba; \ + state[ 1] = X##be; \ + state[ 2] = X##bi; \ + state[ 3] = X##bo; \ + state[ 4] = X##bu; \ + state[ 5] = X##ga; \ + state[ 6] = X##ge; \ + state[ 7] = X##gi; \ + state[ 8] = X##go; \ + state[ 9] = X##gu; \ + state[10] = X##ka; \ + state[11] = X##ke; \ + state[12] = X##ki; \ + state[13] = X##ko; \ + state[14] = X##ku; \ + state[15] = X##ma; \ + state[16] = X##me; \ + state[17] = X##mi; \ + state[18] = X##mo; \ + state[19] = X##mu; \ + state[20] = X##sa; \ + state[21] = X##se; \ + state[22] = X##si; \ + state[23] = X##so; \ + state[24] = X##su; \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + +#if ((defined(FullUnrolling)) || (Unrolling == 12)) +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (Unrolling == 4) +#define 
rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (Unrolling == 3) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#elif (Unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#elif (Unrolling == 1) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#else +#error "Unrolling is not correctly specified!" +#endif + +void KeccakP1600_opt64_Permute_12rounds(void *state) +{ + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + + copyFromState(A, stateAsLanes) + rounds12 + copyToState(stateAsLanes, A) +} + +/* ---------------------------------------------------------------- */ + +void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) +{ + uint64_t lane = ((uint64_t*)state)[lanePosition]; +#ifdef KeccakP1600_opt64_useLaneComplementing + if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) + lane = ~lane; +#endif +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + { + uint64_t lane1[1]; + lane1[0] = lane; + memcpy(data, (uint8_t*)lane1+offset, length); + } +#else + unsigned int i; + lane >>= offset*8; + for(i=0; i<length; i++) { + data[i] = lane & 0xFF; + lane >>= 8; + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +static void fromWordToBytes(uint8_t *bytes, const uint64_t word) +{ + unsigned int i; + + for(i=0; i<(64/8); i++) + bytes[i] = (word >> (8*i)) & 0xFF; +} +#endif + +void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) +{ +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + memcpy(data, state, laneCount*8); +#else + unsigned int i; + + for(i=0; i<laneCount; i++) + fromWordToBytes(data+(i*8), ((const uint64_t*)state)[i]); +#endif +#ifdef KeccakP1600_opt64_useLaneComplementing + if (laneCount > 1) { + ((uint64_t*)data)[ 1] = ~((uint64_t*)data)[ 1]; + if (laneCount > 2) { + ((uint64_t*)data)[ 2] = ~((uint64_t*)data)[ 2]; + if (laneCount > 8) { + ((uint64_t*)data)[ 8] = ~((uint64_t*)data)[ 8]; + if (laneCount > 12) { + ((uint64_t*)data)[12] = ~((uint64_t*)data)[12]; + if (laneCount > 17) { + ((uint64_t*)data)[17] = ~((uint64_t*)data)[17]; + if (laneCount > 20) { + ((uint64_t*)data)[20] = ~((uint64_t*)data)[20]; + } + } + } + } + } + } +#endif +} + +/* ---------------------------------------------------------------- */ + +#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ + { \ + if ((offset) == 0) { \ + SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ + SnP_ExtractBytesInLane(state, \ + (length)/SnP_laneLengthInBytes, \ + (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ + 0, \ + (length)%SnP_laneLengthInBytes); \ + } \ + else { \ + unsigned int _sizeLeft = (length); \ + unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ + unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ + 
unsigned char *_curData = (data); \ + while(_sizeLeft > 0) { \ + unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ + if (_bytesInLane > _sizeLeft) \ + _bytesInLane = _sizeLeft; \ + SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ + _sizeLeft -= _bytesInLane; \ + _lanePosition++; \ + _offsetInLane = 0; \ + _curData += _bytesInLane; \ + } \ + } \ + } + +void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); +} + +/* ---------------------------------------------------------------- */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define HTOLE64(x) (x) +#else +#define HTOLE64(x) (\ + ((x & 0xff00000000000000ull) >> 56) | \ + ((x & 0x00ff000000000000ull) >> 40) | \ + ((x & 0x0000ff0000000000ull) >> 24) | \ + ((x & 0x000000ff00000000ull) >> 8) | \ + ((x & 0x00000000ff000000ull) << 8) | \ + ((x & 0x0000000000ff0000ull) << 24) | \ + ((x & 0x000000000000ff00ull) << 40) | \ + ((x & 0x00000000000000ffull) << 56)) +#endif + +#define addInput(X, input, laneCount) \ + if (laneCount == 21) { \ + X##ba ^= HTOLE64(input[ 0]); \ + X##be ^= HTOLE64(input[ 1]); \ + X##bi ^= HTOLE64(input[ 2]); \ + X##bo ^= HTOLE64(input[ 3]); \ + X##bu ^= HTOLE64(input[ 4]); \ + X##ga ^= HTOLE64(input[ 5]); \ + X##ge ^= HTOLE64(input[ 6]); \ + X##gi ^= HTOLE64(input[ 7]); \ + X##go ^= HTOLE64(input[ 8]); \ + X##gu ^= HTOLE64(input[ 9]); \ + X##ka ^= HTOLE64(input[10]); \ + X##ke ^= HTOLE64(input[11]); \ + X##ki ^= HTOLE64(input[12]); \ + X##ko ^= HTOLE64(input[13]); \ + X##ku ^= HTOLE64(input[14]); \ + X##ma ^= HTOLE64(input[15]); \ + X##me ^= HTOLE64(input[16]); \ + X##mi ^= HTOLE64(input[17]); \ + X##mo ^= HTOLE64(input[18]); \ + X##mu ^= HTOLE64(input[19]); \ + X##sa ^= HTOLE64(input[20]); \ + } \ + +#include <assert.h> + +size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + size_t originalDataByteLen = dataByteLen; + declareABCDE + #ifndef KeccakP1600_opt64_fullUnrolling + unsigned int i; + #endif + uint64_t *stateAsLanes = (uint64_t*)state; + uint64_t *inDataAsLanes = (uint64_t*)data; + + assert(laneCount == 21); + + #define laneCount 21 + copyFromState(A, stateAsLanes) + while(dataByteLen >= laneCount*8) { + addInput(A, inDataAsLanes, laneCount) + rounds12 + inDataAsLanes += laneCount; + dataByteLen -= laneCount*8; + } + #undef laneCount + copyToState(stateAsLanes, A) + return originalDataByteLen - dataByteLen; +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c new file mode 100644 index 0000000..22a0901 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c @@ -0,0 +1,406 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include "brg_endian.h" +#include "KeccakP-1600-SnP.h" + +#ifdef KeccakP1600_disableParallelism +#undef KeccakP1600_enable_simd_options +#else + +// Forward declaration +void KangarooTwelve_SetProcessorCapabilities(); +#ifdef KeccakP1600_enable_simd_options +int K12_SSSE3_requested_disabled = 0; +int K12_AVX2_requested_disabled = 0; +int K12_AVX512_requested_disabled = 0; +#endif // KeccakP1600_enable_simd_options +int K12_enableSSSE3 = 0; +int K12_enableAVX2 = 0; +int K12_enableAVX512 = 0; + +/* ---------------------------------------------------------------- */ + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times2_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableSSSE3; + return result; +} + +const char * KeccakP1600times2_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableSSSE3) { + return "SSSE3 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process2Leaves(input, output); + } else if (K12_enableSSSE3) { + KangarooTwelve_SSSE3_Process2Leaves(input, output); + } +} + + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times4_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + result |= K12_enableAVX2; + return result; +} + +const char * KeccakP1600times4_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else if (K12_enableAVX2) { + return "AVX2 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) { + KangarooTwelve_AVX512_Process4Leaves(input, output); + } else if (K12_enableAVX2) { + KangarooTwelve_AVX2_Process4Leaves(input, output); + } +} + + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); + +int KeccakP1600times8_IsAvailable() +{ + int result = 0; + result |= K12_enableAVX512; + return result; +} + +const char * KeccakP1600times8_GetImplementation() +{ + if (K12_enableAVX512) { + return "AVX-512 implementation"; + } else { + return ""; + } +} + +void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + if (K12_enableAVX512) + KangarooTwelve_AVX512_Process8Leaves(input, output); +} + +#endif // KeccakP1600_disableParallelism + +const char * KeccakP1600_GetImplementation() +{ + if (K12_enableAVX512) + return "AVX-512 implementation"; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return "AVX2 implementation"; + else +#endif + return "generic 64-bit implementation"; +} + +void KeccakP1600_Initialize(void *state) +{ + KangarooTwelve_SetProcessorCapabilities(); + if (K12_enableAVX512) + KeccakP1600_AVX512_Initialize(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Initialize(state); + else +#endif + KeccakP1600_opt64_Initialize(state); +} + +void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int 
offset) +{ + if (K12_enableAVX512) + ((unsigned char*)(state))[offset] ^= data; + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_AddByte(state, data, offset); + else +#endif + KeccakP1600_opt64_AddByte(state, data, offset); +} + +void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_AddBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_AddBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_AddBytes(state, data, offset, length); +} + +void KeccakP1600_Permute_12rounds(void *state) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_Permute_12rounds(state); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_Permute_12rounds(state); + else +#endif + KeccakP1600_opt64_Permute_12rounds(state); +} + +void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + if (K12_enableAVX512) + KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); + else +#endif + KeccakP1600_opt64_ExtractBytes(state, data, offset, length); +} + +size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) +{ + if (K12_enableAVX512) + return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#ifndef KeccakP1600_noAssembly + if (K12_enableAVX2) + return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); + else +#endif + return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); +} + +/* ---------------------------------------------------------------- */ + +/* Processor capability detection code by Samuel Neves and Jack O'Connor, see + * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c + */ + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(IS_X86) +static uint64_t xgetbv() { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 
1 << 6, + /* ... */ + UNDEFINED = 1 << 30 +}; + +static enum cpu_feature g_cpu_features = UNDEFINED; + +static enum cpu_feature + get_cpu_features(void) { + + if (g_cpu_features != UNDEFINED) { + return g_cpu_features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + enum cpu_feature features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + g_cpu_features = features; + return features; +#else + /* How to detect NEON? */ + return 0; +#endif + } +} + +void KangarooTwelve_SetProcessorCapabilities() +{ + enum cpu_feature features = get_cpu_features(); + K12_enableSSSE3 = (features & SSSE3); + K12_enableAVX2 = (features & AVX2); + K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); +#ifdef KeccakP1600_enable_simd_options + K12_enableSSSE3 = K12_enableSSSE3 && !K12_SSSE3_requested_disabled; + K12_enableAVX2 = K12_enableAVX2 && !K12_AVX2_requested_disabled; + K12_enableAVX512 = K12_enableAVX512 && !K12_AVX512_requested_disabled; +#endif // KeccakP1600_enable_simd_options +} + +#ifdef KeccakP1600_enable_simd_options +int KangarooTwelve_DisableSSSE3(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_SSSE3_requested_disabled = 1; + if (K12_enableSSSE3) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // SSSE3 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX2(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX2_requested_disabled = 1; + if (K12_enableAVX2) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX2 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +int KangarooTwelve_DisableAVX512(void) { + KangarooTwelve_SetProcessorCapabilities(); + K12_AVX512_requested_disabled = 1; + if (K12_enableAVX512) { + KangarooTwelve_SetProcessorCapabilities(); + return 1; // AVX512 was disabled on this call. + } else { + return 0; // Nothing changed. + } +} + +void KangarooTwelve_EnableAllCpuFeatures(void) { + K12_SSSE3_requested_disabled = 0; + K12_AVX2_requested_disabled = 0; + K12_AVX512_requested_disabled = 0; + KangarooTwelve_SetProcessorCapabilities(); +} +#endif // KeccakP1600_enable_simd_options diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c new file mode 100644 index 0000000..0abab49 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c @@ -0,0 +1,419 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
+ +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <immintrin.h> +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX2alignment 32 + +#define ANDnu256(a, b) _mm256_andnot_si256(a, b) +#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) +#define CONST256_64(a) _mm256_set1_epi64x(a) +#define LOAD256(a) _mm256_load_si256((const __m256i *)&(a)) +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) +#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) +#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) +#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +static ALIGN(AVX2alignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; +static ALIGN(AVX2alignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; +#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define XOR256(a, b) _mm256_xor_si256(a, b) +#define XOReq256(a, b) a = _mm256_xor_si256(a, b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) +#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c)) +#define ZERO() _mm256_setzero_si256() + +static ALIGN(AVX2alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m256i Aba, Abe, Abi, Abo, Abu; \ + __m256i Aga, Age, Agi, Ago, Agu; \ + __m256i Aka, Ake, Aki, Ako, Aku; \ + __m256i Ama, Ame, Ami, Amo, Amu; \ + __m256i Asa, Ase, Asi, Aso, Asu; \ + __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m256i Bga, Bge, Bgi, Bgo, Bgu; \ + __m256i Bka, Bke, Bki, Bko, Bku; \ + __m256i Bma, Bme, Bmi, Bmo, Bmu; \ + __m256i Bsa, Bse, Bsi, Bso, Bsu; \ + __m256i Ca, Ce, Ci, Co, Cu; \ + __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ + __m256i Da, De, Di, Do, Du; \ + __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m256i Ega, Ege, Egi, Ego, Egu; \ + __m256i Eka, Eke, Eki, Eko, Eku; \ + __m256i Ema, Eme, Emi, Emo, Emu; \ + __m256i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ + Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ + Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ + Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ + Cu = XOR256(Abu, 
XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(Ca, E##ga); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(Ce, E##ge); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + XOReq256(Ci, E##gi); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + XOReq256(Co, E##go); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ + XOReq256(Cu, E##gu); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(Ca, E##ka); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(Ce, E##ke); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + XOReq256(Ci, E##ki); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + XOReq256(Co, E##ko); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ + XOReq256(Cu, E##ku); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(Ca, E##ma); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(Ce, E##me); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + XOReq256(Ci, E##mi); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + XOReq256(Co, E##mo); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ + XOReq256(Cu, E##mu); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(Ca, E##sa); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(Ce, E##se); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + XOReq256(Ci, E##si); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ 
+ XOReq256(Co, E##so); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + XOReq256(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ +\ + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + 
X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq256(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq256(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq256(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq256(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq256(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq256(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ + XOReq256(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq256(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq256(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq256(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq256(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq256(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq256(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq256(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq256(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq256(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq256(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ + XOReq256(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq256(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq256(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq256(Ame, CONST256_64(0x0BULL)); + XOReq256(Asa, CONST256_64(0x8000000000000000ULL)); + rounds12 + + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( Aba, Abe ); + lanesH01 = UNPACKH( Aba, Abe ); + lanesL23 = UNPACKL( Abi, Abo ); + lanesH23 = UNPACKH( Abi, Abo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( 
lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); + } +} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c new file mode 100644 index 0000000..a19fc35 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c @@ -0,0 +1,458 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <string.h> +#include <immintrin.h> +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define AVX512alignment 64 + +#define LOAD4_32(a,b,c,d) _mm_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d)) +#define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h)) +#define LOAD_GATHER2_64(idx,p) _mm_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER4_64(idx,p) _mm256_i32gather_epi64( (const void*)(p), idx, 8) +#define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8) +#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8) + + +/* Keccak-p[1600]×2 */ + +#define XOR(a,b) _mm_xor_si128(a,b) +#define XOReq(a, b) a = _mm_xor_si128(a, b) +#define XOR3(a,b,c) _mm_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define ROL(a,offset) _mm_rol_epi64(a,offset) +#define Chi(a,b,c) _mm_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm_set1_epi64x(a) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(AVX512alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define KeccakP_DeclareVars(type) \ + type _Ba, _Be, _Bi, _Bo, _Bu; \ + type _Da, _De, _Di, _Do, _Du; \ + type _ba, _be, _bi, _bo, _bu; \ + type _ga, _ge, _gi, _go, _gu; \ + type _ka, _ke, _ki, _ko, _ku; \ + type _ma, _me, _mi, _mo, _mu; \ + type _sa, _se, _si, _so, _su + +#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \ + _Bb1 = XOR(_L1, _Da); \ + _Bb2 = XOR(_L2, _De); \ + 
_Bb3 = XOR(_L3, _Di); \ + _Bb4 = XOR(_L4, _Do); \ + _Bb5 = XOR(_L5, _Du); \ + if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \ + _Bb2 = ROL(_Bb2, _Rr2); \ + _Bb3 = ROL(_Bb3, _Rr3); \ + _Bb4 = ROL(_Bb4, _Rr4); \ + _Bb5 = ROL(_Bb5, _Rr5); \ + _L1 = Chi( _Ba, _Be, _Bi); \ + _L2 = Chi( _Be, _Bi, _Bo); \ + _L3 = Chi( _Bi, _Bo, _Bu); \ + _L4 = Chi( _Bo, _Bu, _Ba); \ + _L5 = Chi( _Bu, _Ba, _Be); + +#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \ + _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \ + _Be = XOR5( _be, _ge, _ke, _me, _se ); \ + _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \ + _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \ + _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \ + _Da = ROL( _Be, 1 ); \ + _De = ROL( _Bi, 1 ); \ + _Di = ROL( _Bo, 1 ); \ + _Do = ROL( _Bu, 1 ); \ + _Du = ROL( _Ba, 1 ); \ + _Da = XOR( _Da, _Bu ); \ + _De = XOR( _De, _Ba ); \ + _Di = XOR( _Di, _Be ); \ + _Do = XOR( _Do, _Bi ); \ + _Du = XOR( _Du, _Bo ); \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \ + _L1 = XOR(_L1, _rc) /* Iota */ + +#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 ) + +#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 ) + +#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 ) + +#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \ + KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 ) + +#define KeccakP_4rounds( i ) \ + KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST_64(KeccakP1600RoundConstants[i]) ); \ + KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \ + KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \ + KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST_64(KeccakP1600RoundConstants[i+1]) ); \ + KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \ + KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \ + KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST_64(KeccakP1600RoundConstants[i+2]) ); \ + KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ga, _me, _bi, _ko, _su ); \ + KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \ + KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \ +\ + KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST_64(KeccakP1600RoundConstants[i+3]) ); \ + KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \ + KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \ + KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \ + KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su ) + +#define rounds12 \ + KeccakP_4rounds( 12 ); \ + KeccakP_4rounds( 16 ); \ + KeccakP_4rounds( 20 ) + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); \ + X##mu = 
ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m128i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( _bi, _bo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( _ba, _be ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( _bi, _bo ) ); +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef ROL +#undef Chi +#undef CONST_64 +#undef LOAD6464 +#undef STORE128u +#undef UNPACKL +#undef UNPACKH +#undef ZERO +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×4 */ + +#define XOR(a,b) _mm256_xor_si256(a,b) +#define XOReq(a,b) a = _mm256_xor_si256(a,b) +#define XOR3(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOR512(a,b) _mm512_xor_si512(a,b) +#define ROL(a,offset) _mm256_rol_epi64(a,offset) +#define Chi(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm256_set1_epi64x(a) +#define ZERO() _mm256_setzero_si256() +#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) + +#define XORdata16(X, data0, data1, data2, data3) \ + XOReq(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ + XOReq(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ + XOReq(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ + XOReq(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ + XOReq(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ + XOReq(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ + XOReq(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], 
(data1)[ 6], (data0)[ 6])); \ + XOReq(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ + XOReq(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ + XOReq(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ + XOReq(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ + XOReq(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ + XOReq(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ + XOReq(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ + XOReq(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ + XOReq(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1, data2, data3) \ + XORdata16(X, data0, data1, data2, data3) \ + XOReq(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ + XOReq(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ + XOReq(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ + XOReq(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ + XOReq(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ + +void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m256i); + unsigned int j; + + initializeState(_); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + +#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) +#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) +#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) + { + __m256i lanesL01, lanesL23, lanesH01, lanesH23; + + lanesL01 = UNPACKL( _ba, _be ); + lanesH01 = UNPACKH( _ba, _be ); + lanesL23 = UNPACKL( _bi, _bo ); + lanesH23 = UNPACKH( _bi, _bo ); + STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); + STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); + STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); + STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); + } +/* TODO: check if something like this would be better: + index512 = LOAD8_32(3*laneOffset+1, 2*laneOffset+1, 1*laneOffset+1, 0*laneOffset+1, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset); + STORE_SCATTER8_64(dataAsLanes+0, index512, stateAsLanes512[0/2]); + STORE_SCATTER8_64(dataAsLanes+2, index512, stateAsLanes512[2/2]); +*/ +} + +#undef XOR +#undef XOReq +#undef XOR3 +#undef XOR5 +#undef XOR512 +#undef ROL +#undef Chi +#undef CONST_64 +#undef ZERO +#undef LOAD4_64 +#undef XORdata16 +#undef XORdata21 + + +/* Keccak-p[1600]×8 */ + +#define XOR(a,b) _mm512_xor_si512(a,b) +#define XOReq(a,b) a = _mm512_xor_si512(a,b) +#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) +#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) +#define XOReq512(a, b) a = XOR(a,b) +#define ROL(a,offset) _mm512_rol_epi64(a,offset) +#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) +#define CONST_64(a) _mm512_set1_epi64(a) 
+#define ZERO() _mm512_setzero_si512() +#define LOAD(p) _mm512_loadu_si512(p) + +#define LoadAndTranspose8(dataAsLanes, offset) \ + t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \ + t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \ + t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \ + t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \ + t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \ + t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \ + t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \ + t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \ + r0 = _mm512_unpacklo_epi64(t0, t1); \ + r1 = _mm512_unpackhi_epi64(t0, t1); \ + r2 = _mm512_unpacklo_epi64(t2, t3); \ + r3 = _mm512_unpackhi_epi64(t2, t3); \ + r4 = _mm512_unpacklo_epi64(t4, t5); \ + r5 = _mm512_unpackhi_epi64(t4, t5); \ + r6 = _mm512_unpacklo_epi64(t6, t7); \ + r7 = _mm512_unpackhi_epi64(t6, t7); \ + t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \ + t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \ + t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \ + t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \ + t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \ + t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \ + t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \ + t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \ + r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \ + r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \ + r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \ + r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \ + r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \ + r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \ + r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \ + r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd); \ + +#define XORdata16(X, index, dataAsLanes) \ + LoadAndTranspose8(dataAsLanes, 0) \ + XOReq(X##ba, r0); \ + XOReq(X##be, r1); \ + XOReq(X##bi, r2); \ + XOReq(X##bo, r3); \ + XOReq(X##bu, r4); \ + XOReq(X##ga, r5); \ + XOReq(X##ge, r6); \ + XOReq(X##gi, r7); \ + LoadAndTranspose8(dataAsLanes, 8) \ + XOReq(X##go, r0); \ + XOReq(X##gu, r1); \ + XOReq(X##ka, r2); \ + XOReq(X##ke, r3); \ + XOReq(X##ki, r4); \ + XOReq(X##ko, r5); \ + XOReq(X##ku, r6); \ + XOReq(X##ma, r7); \ + +#define XORdata21(X, index, dataAsLanes) \ + XORdata16(X, index, dataAsLanes) \ + XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \ + XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \ + XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \ + XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \ + XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \ + +void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output) +{ + KeccakP_DeclareVars(__m512i); + unsigned int j; + const uint64_t *outputAsLanes = (const uint64_t *)output; + __m256i index; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + + initializeState(_); + + index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8)); + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(_, index, (const uint64_t *)input); + rounds12 + input += rateInBytes; + } + + XORdata16(_, index, (const uint64_t *)input); + XOReq(_me, CONST_64(0x0BULL)); + XOReq(_sa, CONST_64(0x8000000000000000ULL)); + rounds12 + + index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4); + STORE_SCATTER8_64(outputAsLanes+0, index, _ba); + STORE_SCATTER8_64(outputAsLanes+1, index, _be); + STORE_SCATTER8_64(outputAsLanes+2, index, _bi); + STORE_SCATTER8_64(outputAsLanes+3, index, _bo); 
+} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c new file mode 100644 index 0000000..036df52 --- /dev/null +++ b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c @@ -0,0 +1,438 @@ +/* +K12 based on the eXtended Keccak Code Package (XKCP) +https://github.com/XKCP/XKCP + +The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. + +Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". + +For more information, feedback or questions, please refer to the Keccak Team website: +https://keccak.team/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ + +--- + +Please refer to the XKCP for more details. +*/ + +#include <stdint.h> +#include <tmmintrin.h> +#include "KeccakP-1600-SnP.h" +#include "align.h" + +#define KeccakP1600times2_SSSE3_unrolling 2 + +#define SSSE3alignment 16 + +#define ANDnu128(a, b) _mm_andnot_si128(a, b) +#define CONST128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) +#define LOAD6464(a, b) _mm_set_epi64x(a, b) +#define CONST128_64(a) _mm_set1_epi64x(a) +#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) +#define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) +#define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) +static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; +static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; +#define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) +#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) +#define XOR128(a, b) _mm_xor_si128(a, b) +#define XOReq128(a, b) a = _mm_xor_si128(a, b) +#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) +#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) +#define ZERO() _mm_setzero_si128() + +static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define declareABCDE \ + __m128i Aba, Abe, Abi, Abo, Abu; \ + __m128i Aga, Age, Agi, Ago, Agu; \ + __m128i Aka, Ake, Aki, Ako, Aku; \ + __m128i Ama, Ame, Ami, Amo, Amu; \ + __m128i Asa, Ase, Asi, Aso, Asu; \ + __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ + __m128i Bga, Bge, Bgi, Bgo, Bgu; \ + __m128i Bka, Bke, Bki, Bko, Bku; \ + __m128i Bma, Bme, Bmi, Bmo, Bmu; \ + __m128i Bsa, Bse, Bsi, Bso, Bsu; \ + __m128i Ca, Ce, Ci, Co, Cu; \ + __m128i Da, De, Di, Do, Du; \ + __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ + __m128i Ega, Ege, Egi, Ego, Egu; \ + __m128i Eka, Eke, Eki, Eko, Eku; \ + __m128i Ema, Eme, Emi, Emo, Emu; \ + __m128i Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ + Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ + Ci = XOR128(Abi,
XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ + Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ + Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(Ca, E##ga); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(Ce, E##ge); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + XOReq128(Ci, E##gi); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + XOReq128(Co, E##go); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ + XOReq128(Cu, E##gu); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(Ca, E##ka); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(Ce, E##ke); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + XOReq128(Ci, E##ki); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + XOReq128(Co, E##ko); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ + XOReq128(Cu, E##ku); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(Ca, E##ma); \ + XOReq128(A##mi, Di); \ + Bmo = ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(Ce, E##me); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + XOReq128(Ci, E##mi); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + XOReq128(Co, E##mo); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ + XOReq128(Cu, E##mu); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(Ca, E##sa); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(Ce, E##se); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, 
ANDnu128(Bso, Bsu)); \ + XOReq128(Ci, E##si); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + XOReq128(Co, E##so); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ + XOReq128(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + Da = XOR128(Cu, ROL64in128(Ce, 1)); \ + De = XOR128(Ca, ROL64in128(Ci, 1)); \ + Di = XOR128(Ce, ROL64in128(Co, 1)); \ + Do = XOR128(Ci, ROL64in128(Cu, 1)); \ + Du = XOR128(Co, ROL64in128(Ca, 1)); \ +\ + XOReq128(A##ba, Da); \ + Bba = A##ba; \ + XOReq128(A##ge, De); \ + Bbe = ROL64in128(A##ge, 44); \ + XOReq128(A##ki, Di); \ + Bbi = ROL64in128(A##ki, 43); \ + E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ + XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ + XOReq128(A##mo, Do); \ + Bbo = ROL64in128(A##mo, 21); \ + E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ + XOReq128(A##su, Du); \ + Bbu = ROL64in128(A##su, 14); \ + E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ + E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ + E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ +\ + XOReq128(A##bo, Do); \ + Bga = ROL64in128(A##bo, 28); \ + XOReq128(A##gu, Du); \ + Bge = ROL64in128(A##gu, 20); \ + XOReq128(A##ka, Da); \ + Bgi = ROL64in128(A##ka, 3); \ + E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ + XOReq128(A##me, De); \ + Bgo = ROL64in128(A##me, 45); \ + E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ + XOReq128(A##si, Di); \ + Bgu = ROL64in128(A##si, 61); \ + E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ + E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ + E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ +\ + XOReq128(A##be, De); \ + Bka = ROL64in128(A##be, 1); \ + XOReq128(A##gi, Di); \ + Bke = ROL64in128(A##gi, 6); \ + XOReq128(A##ko, Do); \ + Bki = ROL64in128(A##ko, 25); \ + E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ + XOReq128(A##mu, Du); \ + Bko = ROL64in128_8(A##mu); \ + E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ + XOReq128(A##sa, Da); \ + Bku = ROL64in128(A##sa, 18); \ + E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ + E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ + E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ +\ + XOReq128(A##bu, Du); \ + Bma = ROL64in128(A##bu, 27); \ + XOReq128(A##ga, Da); \ + Bme = ROL64in128(A##ga, 36); \ + XOReq128(A##ke, De); \ + Bmi = ROL64in128(A##ke, 10); \ + E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ + XOReq128(A##mi, Di); \ + Bmo = ROL64in128(A##mi, 15); \ + E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ + XOReq128(A##so, Do); \ + Bmu = ROL64in128_56(A##so); \ + E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ + E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ + E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ +\ + XOReq128(A##bi, Di); \ + Bsa = ROL64in128(A##bi, 62); \ + XOReq128(A##go, Do); \ + Bse = ROL64in128(A##go, 55); \ + XOReq128(A##ku, Du); \ + Bsi = ROL64in128(A##ku, 39); \ + E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ + XOReq128(A##ma, Da); \ + Bso = ROL64in128(A##ma, 41); \ + E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ + XOReq128(A##se, De); \ + Bsu = ROL64in128(A##se, 2); \ + E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ + E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ + E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ +\ + +#define initializeState(X) \ + X##ba = ZERO(); \ + X##be = ZERO(); \ + X##bi = ZERO(); \ + X##bo = ZERO(); \ + X##bu = ZERO(); \ + X##ga = ZERO(); \ + X##ge = ZERO(); \ + X##gi = ZERO(); \ + X##go = ZERO(); \ + X##gu = ZERO(); \ + X##ka = ZERO(); \ + X##ke = ZERO(); \ + X##ki = ZERO(); \ + X##ko = ZERO(); \ + X##ku = ZERO(); \ + X##ma = ZERO(); \ + X##me = ZERO(); \ + X##mi = ZERO(); \ + X##mo = ZERO(); 
\ + X##mu = ZERO(); \ + X##sa = ZERO(); \ + X##se = ZERO(); \ + X##si = ZERO(); \ + X##so = ZERO(); \ + X##su = ZERO(); \ + +#define XORdata16(X, data0, data1) \ + XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ + XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ + XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ + XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ + XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ + XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ + XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ + XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ + XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ + XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ + XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ + XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ + XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ + XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ + XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ + XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ + +#define XORdata21(X, data0, data1) \ + XORdata16(X, data0, data1) \ + XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ + XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ + XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ + XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ + XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ + +#if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 6) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 4) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (KeccakP1600times2_SSSE3_unrolling == 2) +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#else +#error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" 
+#endif + +#define chunkSize 8192 +#define rateInBytes (21*8) + +void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) +{ + declareABCDE + #ifndef KeccakP1600times2_SSSE3_fullUnrolling + unsigned int i; + #endif + unsigned int j; + + initializeState(A); + + for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { + XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + rounds12 + input += rateInBytes; + } + + XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); + XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); + XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); + rounds12 + + STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); + STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); + STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); +}
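The patch itself contains no caller for this new entry point, so here is a minimal usage sketch. It assumes only the prototype and constants visible in the file above: each call to KangarooTwelve_SSSE3_Process2Leaves absorbs two consecutive 8192-byte leaves (chunkSize) at a rate of 21 lanes per block with the 0x0B domain separator, and writes one 32-byte chaining value per leaf to output, leaf 0 first. The buffer names, the test pattern, and the constants leafSize/cvSize below are illustrative, not part of the library.

#include <stdint.h>
#include <stdio.h>

/* Prototype of the entry point added by this patch (KeccakP-1600-timesN-SSSE3.c). */
void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output);

enum {
    leafSize = 8192,   /* matches chunkSize in the file above */
    cvSize   = 32      /* each leaf yields a 32-byte chaining value */
};

int main(void)
{
    static unsigned char leaves[2 * leafSize]; /* leaf 0 followed immediately by leaf 1 */
    unsigned char cv[2 * cvSize];              /* cv[0..31] for leaf 0, cv[32..63] for leaf 1 */
    size_t i;

    for (i = 0; i < sizeof(leaves); i++)       /* arbitrary test pattern */
        leaves[i] = (unsigned char)i;

    /* Process both leaves in parallel in the two 64-bit lanes of each __m128i. */
    KangarooTwelve_SSSE3_Process2Leaves(leaves, cv);

    for (i = 0; i < sizeof(cv); i++)
        printf("%02x%s", cv[i], (i % cvSize == cvSize - 1) ? "\n" : "");
    return 0;
}

Compile together with the patched file and SSSE3 enabled (for example, gcc -mssse3); the AVX-512 variant above follows the same contract but processes eight leaves per call and writes eight consecutive 32-byte chaining values.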